diff --git a/airflow/Dockerfile b/airflow/Dockerfile index 3d34a26..9a9d97e 100644 --- a/airflow/Dockerfile +++ b/airflow/Dockerfile @@ -105,7 +105,9 @@ RUN mv /usr/local/bin/pip /usr/local/bin/pip.orig && \ "gunicorn==20.1.0" \ "python-ffmpeg==2.0.12" \ "ffprobe3" \ - "python-dotenv" && \ + "python-dotenv" \ + "PyYAML" \ + "aria2p" && \ mv /usr/local/bin/pip.orig /usr/local/bin/pip # --- Install the custom yt_ops_services package --- @@ -117,6 +119,12 @@ COPY --chown=airflow:airflow yt_ops_services ./yt_ops_services/ COPY --chown=airflow:airflow thrift_model ./thrift_model/ COPY --chown=airflow:airflow pangramia ./pangramia/ +# Copy the ytops-client tool and its executable +COPY --chown=airflow:airflow ytops_client ./ytops_client/ +COPY --chown=airflow:airflow bin/ytops-client /app/bin/ytops-client +RUN chmod +x /app/bin/ytops-client +ENV PATH="/app/bin:${PATH}" + # Install the package in editable mode. This runs setup.py and installs all dependencies # listed in `install_requires`, making the `yt_ops_services` module available everywhere. # Bypass the pip root check again. diff --git a/airflow/configs/docker-compose-ytdlp-ops.yaml.j2 b/airflow/configs/docker-compose-ytdlp-ops.yaml.j2 index 171c97f..92f7b3a 100644 --- a/airflow/configs/docker-compose-ytdlp-ops.yaml.j2 +++ b/airflow/configs/docker-compose-ytdlp-ops.yaml.j2 @@ -118,14 +118,14 @@ services: - "{{ service_role }}" # --- S3 Logging Parameters --- - - "--s3-endpoint-url" - - "${S3_ENDPOINT_URL}" - - "--s3-access-key-id" - - "${S3_ACCESS_KEY_ID}" - - "--s3-secret-access-key" - - "${S3_SECRET_ACCESS_KEY}" - - "--s3-region-name" - - "${S3_REGION_NAME}" + #- "--s3-endpoint-url" + #- "${S3_ENDPOINT_URL}" + #- "--s3-access-key-id" + #- "${S3_ACCESS_KEY_ID}" + #- "--s3-secret-access-key" + #- "${S3_SECRET_ACCESS_KEY}" + #- "--s3-region-name" + #- "${S3_REGION_NAME}" {% if service_role is defined and service_role != 'management' %} # --- Parameters for worker/all-in-one roles ONLY --- - "--script-dir" diff --git a/airflow/configs/nginx.conf b/airflow/configs/nginx.conf index 106e774..9772baa 100644 --- a/airflow/configs/nginx.conf +++ b/airflow/configs/nginx.conf @@ -4,11 +4,11 @@ events { http { upstream minio_servers { - server minio:9000; + server 172.17.0.1:9001; } upstream minio_console_servers { - server minio:9001; + server 172.17.0.1:9002; } server { diff --git a/airflow/dags/ytdlp_mgmt_proxy_account.py b/airflow/dags/ytdlp_mgmt_proxy_account.py index 0e180f3..8436005 100644 --- a/airflow/dags/ytdlp_mgmt_proxy_account.py +++ b/airflow/dags/ytdlp_mgmt_proxy_account.py @@ -45,7 +45,7 @@ except ImportError as e: raise DEFAULT_MANAGEMENT_SERVICE_IP = Variable.get("MANAGEMENT_SERVICE_HOST", default_var="envoy-thrift-lb") -DEFAULT_MANAGEMENT_SERVICE_PORT = Variable.get("MANAGEMENT_SERVICE_PORT", default_var=9080) +DEFAULT_MANAGEMENT_SERVICE_PORT = Variable.get("MANAGEMENT_SERVICE_PORT", default_var=9980) DEFAULT_REDIS_CONN_ID = "redis_default" # Version tracking for debugging diff --git a/airflow/dags/ytdlp_mgmt_queues.py b/airflow/dags/ytdlp_mgmt_queues.py index 930b821..272dd0b 100644 --- a/airflow/dags/ytdlp_mgmt_queues.py +++ b/airflow/dags/ytdlp_mgmt_queues.py @@ -55,9 +55,13 @@ def _get_predefined_url_lists(): 'urls.dh128.json', 'urls.rt100.json', 'urls.rt25.json', + 'urls.rt250.json', + 'urls.rt500.json', + 'urls.rt3000.json', 'urls.sky28.json', 'urls.sky3.json', 'urls.tq46.json', + 'urls.topnews500.json', ] return ['None'] + sorted(predefined_files) @@ -256,15 +260,15 @@ def clear_queue_callable(**context): redis_conn_id = 
params['redis_conn_id'] queue_system = params.get('queue_system', 'v1_monolithic') + queue_base_names_to_clear = [] if queue_system == 'v1_monolithic': - queue_base_name = params['queue_base_name'] - elif queue_system == 'v2_separated_auth': - queue_base_name = 'queue2_auth' - elif queue_system == 'v2_separated_dl': - queue_base_name = 'queue2_dl' + queue_base_names_to_clear.append(params['queue_base_name']) + elif queue_system.startswith('v2_'): + # For v2, clear both auth and dl queues for a complete clear. + queue_base_names_to_clear.extend(['queue2_auth', 'queue2_dl']) else: raise ValueError(f"Invalid queue_system: {queue_system}") - logger.info(f"Operating on queue system '{queue_system}' with base name '{queue_base_name}'.") + logger.info(f"Operating on queue system '{queue_system}' with base names: {queue_base_names_to_clear}.") queues_to_clear_options = params.get('queues_to_clear_options', []) confirm_clear = params.get('confirm_clear', False) @@ -290,14 +294,15 @@ def clear_queue_callable(**context): all_suffixes = ['_inbox', '_fail', '_result', '_progress'] keys_to_delete = set() - if '_all' in queues_to_clear_options: - logger.info("'_all' option selected. Clearing all standard queues.") - for suffix in all_suffixes: - keys_to_delete.add(f"{queue_base_name}{suffix}") - else: - for suffix in queues_to_clear_options: - if suffix in all_suffixes: + for queue_base_name in queue_base_names_to_clear: + if '_all' in queues_to_clear_options: + logger.info(f"'_all' option selected. Clearing all standard queues for base '{queue_base_name}'.") + for suffix in all_suffixes: keys_to_delete.add(f"{queue_base_name}{suffix}") + else: + for suffix in queues_to_clear_options: + if suffix in all_suffixes: + keys_to_delete.add(f"{queue_base_name}{suffix}") if not keys_to_delete: logger.warning("No valid queue suffixes were selected. 
Nothing to delete.") diff --git a/airflow/dags/ytdlp_ops_account_maintenance.py b/airflow/dags/ytdlp_ops_account_maintenance.py index abfa8f6..38828f0 100644 --- a/airflow/dags/ytdlp_ops_account_maintenance.py +++ b/airflow/dags/ytdlp_ops_account_maintenance.py @@ -37,7 +37,7 @@ logger = logging.getLogger(__name__) # Default settings from Airflow Variables or hardcoded fallbacks DEFAULT_REDIS_CONN_ID = 'redis_default' DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1") -DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080) +DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9980) DEFAULT_ARGS = { 'owner': 'airflow', diff --git a/airflow/dags/ytdlp_ops_v01_orchestrator.py b/airflow/dags/ytdlp_ops_v01_orchestrator.py index 3ea5aa0..e295d8e 100644 --- a/airflow/dags/ytdlp_ops_v01_orchestrator.py +++ b/airflow/dags/ytdlp_ops_v01_orchestrator.py @@ -75,10 +75,10 @@ DEFAULT_REQUEST_PARAMS_JSON = """{ # Default settings DEFAULT_QUEUE_NAME = 'video_queue' DEFAULT_REDIS_CONN_ID = 'redis_default' -DEFAULT_TOTAL_WORKERS = 3 +DEFAULT_TOTAL_WORKERS = 8 DEFAULT_WORKERS_PER_BUNCH = 1 -DEFAULT_WORKER_DELAY_S = 5 -DEFAULT_BUNCH_DELAY_S = 20 +DEFAULT_WORKER_DELAY_S = 1 +DEFAULT_BUNCH_DELAY_S = 1 DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1") DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080) @@ -323,7 +323,7 @@ with DAG( # --- Worker Passthrough Parameters --- 'on_auth_failure': Param( - 'retry_with_new_account', + 'proceed_loop_under_manual_inspection', type="string", enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'proceed_loop_under_manual_inspection'], title="[Worker Param] On Authentication Failure Policy", @@ -343,38 +343,17 @@ with DAG( "'proceed_loop': (Default) Mark URL as failed but continue the processing loop with a new URL. " "'retry_with_new_token': Attempt to get a new token with a new account and retry the download once. If it fails again, proceed loop." ), - 'request_params_json': Param(DEFAULT_REQUEST_PARAMS_JSON, type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service.", render_kwargs={"rows": 20, "cols": 120}), + 'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."), 'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="[Worker Param] Base name for Redis queues."), 'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."), 'clients': Param( - 'mweb,web_camoufox,tv', + 'tv_simply', type="string", enum=[ - 'mweb,web_camoufox,tv', + 'tv_simply', 'mweb', - 'web_camoufox', 'tv', 'custom', - 'tv,web_safari,mweb,web_camoufox', - 'web_safari', - 'web', - 'web_embedded', - 'web_music', - 'web_creator', - 'web_safari_camoufox', - 'web_embedded_camoufox', - 'web_music_camoufox', - 'web_creator_camoufox', - 'mweb_camoufox', - 'android', - 'android_music', - 'android_creator', - 'android_vr', - 'ios', - 'ios_music', - 'ios_creator', - 'tv_simply', - 'tv_embedded', ], title="[Worker Param] Clients", description="[Worker Param] Comma-separated list of clients for token generation. 
Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_embedded. See DAG documentation for details." @@ -402,27 +381,24 @@ with DAG( type="string", enum=['best_audio', 'formats_0', 'formats_2', 'formats_3', 'custom'], title="[Worker Param] Download Format Preset", - description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformats_0: 18,140\nformats_2: 18,140,299/298/137/136/135/134/133\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318" + description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformats_0: 18,140\nformats_2: 18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318" ), 'download_format_custom': Param( - '18,140,299/298/137/136/135/134/133', + '18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy', type="string", title="[Worker Param] Custom Download Format", description="Custom yt-dlp format string. Used when preset is 'custom'. E.g., 'ba[ext=m4a]/bestaudio/best'." ), 'downloader': Param( - 'default', + 'cli', type="string", - enum=['default', 'aria2c'], - title="[Worker Param] Downloader", - description="Choose the downloader for yt-dlp." - ), - 'downloader_args_aria2c': Param( - 'aria2c:-x 4 -k 2M --max-download-limit=3M', - type="string", - title="[Worker Param] Aria2c Downloader Arguments", - description="Arguments to pass to yt-dlp's --downloader-args. Used when downloader is 'aria2c'." + enum=['py', 'aria-rpc', 'cli'], + title="[Worker Param] Download Tool", + description="Choose the download tool to use: 'py' (native python, recommended), 'aria-rpc' (send to aria2c daemon), 'cli' (legacy yt-dlp wrapper)." ), + 'aria_host': Param('172.17.0.1', type="string", title="[Worker Param] Aria2c Host", description="For 'aria-rpc' downloader: Host of the aria2c RPC server. Can be set via Airflow Variable 'YTDLP_ARIA_HOST'."), + 'aria_port': Param(6800, type="integer", title="[Worker Param] Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server. Can be set via Airflow Variable 'YTDLP_ARIA_PORT'."), + 'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="[Worker Param] Aria2c Secret", description="For 'aria-rpc' downloader: Secret token. 
Can be set via Airflow Variable 'YTDLP_ARIA_SECRET'."), 'yt_dlp_extra_args': Param( '--restrict-filenames', type=["string", "null"], diff --git a/airflow/dags/ytdlp_ops_v01_worker_per_url.py b/airflow/dags/ytdlp_ops_v01_worker_per_url.py index 9c01451..6d03b0e 100644 --- a/airflow/dags/ytdlp_ops_v01_worker_per_url.py +++ b/airflow/dags/ytdlp_ops_v01_worker_per_url.py @@ -290,7 +290,10 @@ def get_url_and_assign_account(**context): @task def get_token(initial_data: dict, **context): - """Makes a single attempt to get a token from the Thrift service.""" + """Makes a single attempt to get a token by calling the ytops-client get-info tool.""" + import subprocess + import shlex + ti = context['task_instance'] params = context['params'] @@ -298,129 +301,85 @@ def get_token(initial_data: dict, **context): url = initial_data['url_to_process'] info_json_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') - host, port, timeout = params['service_ip'], int(params['service_port']), int(params.get('timeout', DEFAULT_TIMEOUT)) + host, port = params['service_ip'], int(params['service_port']) machine_id = params.get('machine_id') or socket.gethostname() clients = params.get('clients') request_params_json = params.get('request_params_json', '{}') assigned_proxy_url = params.get('assigned_proxy_url') - # Pretty-print the request parameters for debugging - try: - pretty_request_params = json.dumps(json.loads(request_params_json), indent=2) - logger.info(f"\n--- Request Parameters ---\n{pretty_request_params}\n--- End of Request Parameters ---") - except (json.JSONDecodeError, TypeError): - logger.warning("Could not parse request_params_json. Using raw content.") - logger.info(f"\n--- Raw Request Parameters ---\n{request_params_json}\n--- End of Raw Request Parameters ---") + video_id = _extract_video_id(url) + os.makedirs(info_json_dir, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + info_json_path = os.path.join(info_json_dir, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json") - # Construct Airflow log context to pass to the service - try: - from airflow.configuration import conf - remote_base = conf.get('logging', 'remote_base_log_folder') - log_path = ( - f"{remote_base}/dag_id={ti.dag_id}/run_id={ti.run_id}/" - f"task_id={ti.task_id}/attempt={ti.try_number}.log" - ) - airflow_log_context = AirflowLogContext( - logS3Path=log_path, - dagId=ti.dag_id, - runId=ti.run_id, - taskId=ti.task_id, - tryNumber=ti.try_number, - workerHostname=socket.gethostname(), - queue=ti.queue - ) - logger.info(f"Constructed Airflow log context for yt-ops service: {airflow_log_context}") - except Exception as e: - logger.warning(f"Could not construct full Airflow log context: {e}. 
Creating a basic one.") - airflow_log_context = AirflowLogContext( - dagId=ti.dag_id, - runId=ti.run_id, - taskId=ti.task_id, - tryNumber=ti.try_number, - workerHostname=socket.gethostname(), - queue=ti.queue - ) + cmd = [ + 'ytops-client', 'get-info', + '--host', host, + '--port', str(port), + '--profile', account_id, + '--output', info_json_path, + '--print-proxy', + '--verbose', + '--log-return', + ] + + if clients: + cmd.extend(['--client', clients]) + if machine_id: + cmd.extend(['--machine-id', machine_id]) + if request_params_json and request_params_json != '{}': + cmd.extend(['--request-params-json', request_params_json]) + if assigned_proxy_url: + cmd.extend(['--assigned-proxy-url', assigned_proxy_url]) + cmd.append(url) + logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' (Clients: {clients}) ---") - client, transport = None, None - try: - client, transport = _get_thrift_client(host, port, timeout) - token_data = client.getOrRefreshToken( - accountId=account_id, - updateType=TokenUpdateMode.AUTO, - url=url, - clients=clients, - machineId=machine_id, - airflowLogContext=airflow_log_context, - requestParamsJson=request_params_json, - assignedProxyUrl=assigned_proxy_url - ) + copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd) + logger.info(f"Executing command: {copy_paste_cmd}") - # Log a compact summary of the Thrift response, omitting large/detailed fields. - summary_token_data = copy(token_data) - if hasattr(summary_token_data, 'infoJson') and summary_token_data.infoJson: - summary_token_data.infoJson = f"... ({len(summary_token_data.infoJson)} bytes) ..." - if hasattr(summary_token_data, 'cookiesBlob') and summary_token_data.cookiesBlob: - summary_token_data.cookiesBlob = f"... ({len(summary_token_data.cookiesBlob)} bytes) ..." - # These will be logged separately below. - if hasattr(summary_token_data, 'requestSummary'): - summary_token_data.requestSummary = "..." - if hasattr(summary_token_data, 'communicationLogs'): - summary_token_data.communicationLogs = "..." - logger.info(f"Thrift service response summary: {summary_token_data}") + process = subprocess.run(cmd, capture_output=True, text=True, timeout=int(params.get('timeout', DEFAULT_TIMEOUT))) - request_summary = getattr(token_data, 'requestSummary', None) - if request_summary: - # Prepending a newline for better separation in logs. - logger.info(f"\n--- Request Summary ---\n{request_summary}") + if process.stdout: + logger.info(f"ytops-client STDOUT:\n{process.stdout}") + if process.stderr: + logger.info(f"ytops-client STDERR:\n{process.stderr}") - communication_logs = getattr(token_data, 'communicationLogs', None) - if communication_logs: - logger.info("--- Communication Logs from Token Service ---") - logger.info(communication_logs) - logger.info("--- End of Communication Logs ---") - - info_json = getattr(token_data, 'infoJson', None) - if not (info_json and json.loads(info_json)): - raise AirflowException("Service returned success but info.json was empty or invalid.") - - video_id = _extract_video_id(url) - os.makedirs(info_json_dir, exist_ok=True) - # Use a readable timestamp for a unique filename on each attempt. 
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - info_json_path = os.path.join(info_json_dir, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json") - with open(info_json_path, 'w', encoding='utf-8') as f: - f.write(info_json) - - proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None) - ytdlp_command = getattr(token_data, 'ytdlpCommand', None) - if ytdlp_command: - logger.info(f"--- YTDLP Command from Token Service ---\n{ytdlp_command}\n--- End of YTDLP Command ---") - - return { - 'info_json_path': info_json_path, - 'socks_proxy': getattr(token_data, proxy_attr) if proxy_attr else None, - 'ytdlp_command': ytdlp_command, - 'successful_account_id': account_id, - 'original_url': url, # Include original URL for fallback - } - except (PBServiceException, PBUserException, TTransportException) as e: - error_context = getattr(e, 'context', None) - if isinstance(error_context, str): - try: error_context = json.loads(error_context.replace("'", "\"")) - except: pass + if process.returncode != 0: + error_message = "ytops-client failed. See logs for details." + for line in reversed(process.stderr.strip().split('\n')): + if 'ERROR' in line or 'Thrift error' in line or 'Connection to server failed' in line: + error_message = line.strip() + break + error_code = 'GET_INFO_CLIENT_FAIL' + if "BOT_DETECTED" in process.stderr: + error_code = "BOT_DETECTED" + elif "BOT_DETECTION_SIGN_IN_REQUIRED" in process.stderr: + error_code = "BOT_DETECTION_SIGN_IN_REQUIRED" + elif "Connection to server failed" in process.stderr: + error_code = "TRANSPORT_ERROR" + error_details = { - 'error_message': getattr(e, 'message', str(e)), - 'error_code': getattr(e, 'errorCode', 'TRANSPORT_ERROR'), - 'proxy_url': error_context.get('proxy_url') if isinstance(error_context, dict) else None + 'error_message': error_message, + 'error_code': error_code, + 'proxy_url': None } - logger.error(f"Thrift call failed for account '{account_id}'. 
Details: {error_details}") ti.xcom_push(key='error_details', value=error_details) - raise AirflowException(f"Thrift call failed: {error_details['error_message']}") - finally: - if transport and transport.isOpen(): - transport.close() + raise AirflowException(f"ytops-client get-info failed: {error_message}") + + proxy = None + proxy_match = re.search(r"Proxy used: (.*)", process.stderr) + if proxy_match: + proxy = proxy_match.group(1).strip() + + return { + 'info_json_path': info_json_path, + 'socks_proxy': proxy, + 'ytdlp_command': None, + 'successful_account_id': account_id, + 'original_url': url, + } @task.branch def handle_bannable_error_branch(task_id_to_check: str, **context): @@ -706,7 +665,7 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context elif format_preset == 'formats_0': download_format = '18,140' elif format_preset == 'formats_2': - download_format = '18,140,299/298/137/136/135/134/133' + download_format = '18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy' elif format_preset == 'formats_3': download_format = '18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318' else: @@ -720,112 +679,102 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context raise AirflowException(f"Error: info.json path is missing or file does not exist ({info_json_path}).") def run_yt_dlp_command(format_selector: str): - """Constructs and runs a yt-dlp command, returning a list of final filenames.""" - cmd = [ - 'yt-dlp', '--verbose', '--print-traffic', '--load-info-json', info_json_path, - '-f', format_selector, '-o', full_output_path, - '--print', 'filename', '--continue', '--no-progress', '--no-simulate', - '--no-write-info-json', '--ignore-errors', '--no-playlist', - ] - - if params.get('fragment_retries'): - cmd.extend(['--fragment-retries', str(params['fragment_retries'])]) - if params.get('limit_rate'): - cmd.extend(['--limit-rate', params['limit_rate']]) - if params.get('socket_timeout'): - cmd.extend(['--socket-timeout', str(params['socket_timeout'])]) - if params.get('min_sleep_interval'): - cmd.extend(['--min-sleep-interval', str(params['min_sleep_interval'])]) - if params.get('max_sleep_interval'): - cmd.extend(['--max-sleep-interval', str(params['max_sleep_interval'])]) - if params.get('yt_dlp_test_mode'): - cmd.append('--test') + """Constructs and runs a yt-ops-client download command, returning a list of final filenames.""" + downloader = params.get('downloader', 'py') + cmd = ['ytops-client', 'download', downloader, '--load-info-json', info_json_path, '-f', format_selector] - downloader = params.get('downloader', 'default') - if proxy and not (downloader == 'aria2c' and proxy.startswith('socks5://')): + if proxy: cmd.extend(['--proxy', proxy]) - gost_process = None - try: - if downloader == 'aria2c': - cmd.extend(['--downloader', 'aria2c']) - downloader_args = params.get('downloader_args_aria2c') - if proxy and proxy.startswith('socks5://'): - import socket - from contextlib import closing - def find_free_port(): - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: - s.bind(('', 0)) - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - return s.getsockname()[1] - local_port = find_free_port() - http_proxy = f"http://127.0.0.1:{local_port}" - logger.info(f"Starting gost for format '{format_selector}' to forward {proxy} to {http_proxy}") - gost_cmd = ['gost', '-L', f'http://127.0.0.1:{local_port}', '-F', proxy] - gost_process = 
subprocess.Popen(gost_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - time.sleep(1) - if gost_process.poll() is not None: - stdout, stderr = gost_process.communicate() - logger.error(f"gost failed to start. Exit: {gost_process.returncode}. Stdout: {stdout.decode()}. Stderr: {stderr.decode()}") - raise AirflowException("gost proxy tunnel failed to start.") - user_args = downloader_args[len('aria2c:'):] if downloader_args and downloader_args.startswith('aria2c:') else (downloader_args or "") - final_args_str = f'aria2c:{user_args.strip()} --http-proxy={http_proxy}' - cmd.extend(['--downloader-args', final_args_str]) - elif downloader_args: - cmd.extend(['--downloader-args', downloader_args]) + if downloader == 'py': + cmd.extend(['--output-dir', download_dir]) + # The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args + py_extra_args = [] + if params.get('fragment_retries'): + py_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])]) + if params.get('limit_rate'): + py_extra_args.extend(['--limit-rate', params['limit_rate']]) + if params.get('socket_timeout'): + py_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])]) + if params.get('min_sleep_interval'): + py_extra_args.extend(['--sleep-interval', str(params['min_sleep_interval'])]) + if params.get('max_sleep_interval'): + py_extra_args.extend(['--max-sleep-interval', str(params['max_sleep_interval'])]) + if params.get('yt_dlp_test_mode'): + py_extra_args.append('--test') - extra_args = params.get('yt_dlp_extra_args') - if extra_args: - cmd.extend(shlex.split(extra_args)) - if original_url: - cmd.append(original_url) + existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '') + final_extra_args = existing_extra + py_extra_args + if final_extra_args: + cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)]) - copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd) - logger.info(f"Executing yt-dlp command for format '{format_selector}': {copy_paste_cmd}") - process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600) - - if process.stdout: - logger.info(f"yt-dlp STDOUT for format '{format_selector}':\n{process.stdout}") - if process.stderr: - # yt-dlp often prints progress and informational messages to stderr - logger.info(f"yt-dlp STDERR for format '{format_selector}':\n{process.stderr}") + elif downloader == 'aria-rpc': + cmd.extend([ + '--aria-host', params.get('aria_host', '172.17.0.1'), + '--aria-port', str(params.get('aria_port', 6800)), + '--aria-secret', params.get('aria_secret'), + '--wait', '--auto-merge-fragments', + '--fragments-dir', download_dir, + '--output-dir', download_dir, + ]) + if params.get('yt_dlp_cleanup_mode'): + cmd.append('--cleanup') - if process.returncode != 0: - logger.error(f"yt-dlp failed for format '{format_selector}' with exit code {process.returncode}") - # STDOUT and STDERR are already logged above. - raise AirflowException(f"yt-dlp command failed for format '{format_selector}'. {process.stderr}") + elif downloader == 'cli': + cmd.extend(['--output-dir', download_dir]) + # The 'cli' tool is the old yt-dlp wrapper, so it takes similar arguments. 
+ cli_extra_args = [] + if params.get('fragment_retries'): + cli_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])]) + if params.get('limit_rate'): + cli_extra_args.extend(['--limit-rate', params['limit_rate']]) + if params.get('socket_timeout'): + cli_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])]) + if params.get('min_sleep_interval'): + cli_extra_args.extend(['--sleep-interval', str(params['min_sleep_interval'])]) + if params.get('max_sleep_interval'): + cli_extra_args.extend(['--max-sleep-interval', str(params['max_sleep_interval'])]) + if params.get('yt_dlp_test_mode'): + cli_extra_args.append('--test') - # In test mode, files are not created, so we only check that yt-dlp returned filenames. - # Otherwise, we verify that the files actually exist on disk. - output_files = [f for f in process.stdout.strip().split('\n') if f] - if not params.get('yt_dlp_test_mode'): - output_files = [f for f in output_files if os.path.exists(f)] + existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '') + final_extra_args = existing_extra + cli_extra_args + if final_extra_args: + cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)]) - if not output_files: - log_msg = (f"Test run for format '{format_selector}' did not produce any filenames." - if params.get('yt_dlp_test_mode') else - f"Download for format '{format_selector}' finished but no output files exist.") - exc_msg = (f"Test run for format '{format_selector}' did not produce any filenames." - if params.get('yt_dlp_test_mode') else - f"Download for format '{format_selector}' did not produce a file.") - - logger.error(log_msg) - logger.error(f"Full STDOUT:\n{process.stdout}") - logger.error(f"Full STDERR:\n{process.stderr}") - raise AirflowException(exc_msg) - - log_prefix = "SUCCESS (Test Mode):" if params.get('yt_dlp_test_mode') else "SUCCESS:" - logger.info(f"{log_prefix} Command for format '{format_selector}' complete. Files: {output_files}") - return output_files - finally: - if gost_process: - logger.info(f"Terminating gost process (PID: {gost_process.pid}) for format '{format_selector}'.") - gost_process.terminate() - try: - gost_process.wait(timeout=5) - except subprocess.TimeoutExpired: - gost_process.kill() - gost_process.wait() + copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd) + logger.info(f"Executing download command for format '{format_selector}': {copy_paste_cmd}") + process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600) + + if process.stdout: + logger.info(f"Download tool STDOUT for format '{format_selector}':\n{process.stdout}") + if process.stderr: + logger.info(f"Download tool STDERR for format '{format_selector}':\n{process.stderr}") + + if process.returncode != 0: + logger.error(f"Download tool failed for format '{format_selector}' with exit code {process.returncode}") + raise AirflowException(f"Download command failed for format '{format_selector}'. 
See logs for details.") + + output_files = [] + for line in process.stdout.strip().split('\n'): + # For aria-rpc, parse "Download and merge successful: " or "Download successful: " + match = re.search(r'successful: (.+)', line) + if match: + filepath = match.group(1).strip() + if os.path.exists(filepath): + output_files.append(filepath) + else: + logger.warning(f"File path from aria-rpc output does not exist locally: '{filepath}'") + # For py/cli, it's just the path + elif os.path.exists(line.strip()): + output_files.append(line.strip()) + + if not params.get('yt_dlp_test_mode') and not output_files: + raise AirflowException(f"Download for format '{format_selector}' finished but no output files were found or exist.") + + log_prefix = "SUCCESS (Test Mode):" if params.get('yt_dlp_test_mode') else "SUCCESS:" + logger.info(f"{log_prefix} Command for format '{format_selector}' complete. Files: {output_files}") + return output_files def run_ffmpeg_probe(filename): """Probes a file with ffmpeg to check for corruption.""" @@ -1512,7 +1461,7 @@ with DAG( 'prepend_client_to_account': Param(True, type="boolean", title="[Worker Param] Prepend Client to Account", description="If True, prepends client and timestamp to account names in prefix mode."), 'machine_id': Param(None, type=["string", "null"]), 'assigned_proxy_url': Param(None, type=["string", "null"], title="[Worker Param] Assigned Proxy URL", description="A specific proxy URL to use for the request, overriding the server's proxy pool logic."), - 'clients': Param('mweb,web_camoufox,tv', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"), + 'clients': Param('tv_simply', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"), 'timeout': Param(DEFAULT_TIMEOUT, type="integer"), 'output_path_template': Param("%(title)s [%(id)s].f%(format_id)s.%(ext)s", type="string", title="[Worker Param] Output Path Template", description="Output filename template for yt-dlp. It is highly recommended to include `%(format_id)s` to prevent filename collisions when downloading multiple formats."), 'on_auth_failure': Param( @@ -1542,11 +1491,11 @@ with DAG( 'min_sleep_interval': Param(5, type="integer", title="[Worker Param] Min Sleep Interval", description="Minimum time to sleep between downloads (seconds)."), 'max_sleep_interval': Param(10, type="integer", title="[Worker Param] Max Sleep Interval", description="Maximum time to sleep between downloads (seconds)."), 'download_format_preset': Param( - 'custom', + 'formats_2', type="string", enum=['best_audio', 'formats_0', 'formats_2', 'formats_3', 'custom'], title="Download Format Preset", - description="Select a predefined format string or choose 'custom'. To download multiple formats, this should be a comma-separated list of format IDs (e.g., '137,140').\nformats_0: 18,140\nformats_2: 18,140,299/298/137/136/135/134/133\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318" + description="Select a predefined format string or choose 'custom'. 
To download multiple formats, this should be a comma-separated list of format IDs (e.g., '137,140').\nformats_0: 18,140\nformats_2: 18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318" ), 'download_format_custom': Param( '18,140,299/298/137/136/135/134/133', @@ -1555,18 +1504,15 @@ with DAG( description="Custom yt-dlp format string. Used when preset is 'custom'. To download multiple formats, provide a comma-separated list of format IDs (e.g., '137,140')." ), 'downloader': Param( - 'default', + 'cli', type="string", - enum=['default', 'aria2c'], - title="Downloader", - description="Choose the downloader for yt-dlp." - ), - 'downloader_args_aria2c': Param( - 'aria2c:-x 4 -k 2M --max-download-limit=3M', - type="string", - title="Aria2c Downloader Arguments", - description="Arguments to pass to yt-dlp's --downloader-args. Used when downloader is 'aria2c'." + enum=['py', 'aria-rpc', 'cli'], + title="Download Tool", + description="Choose the download tool to use: 'py' (native python, recommended), 'aria-rpc' (send to aria2c daemon), 'cli' (legacy yt-dlp wrapper)." ), + 'aria_host': Param('172.17.0.1', type="string", title="Aria2c Host", description="For 'aria-rpc' downloader: Host of the aria2c RPC server."), + 'aria_port': Param(6800, type="integer", title="Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server."), + 'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="Aria2c Secret", description="For 'aria-rpc' downloader: Secret token."), 'yt_dlp_extra_args': Param( '', type=["string", "null"], diff --git a/airflow/dags/ytdlp_ops_v02_orchestrator_auth.py b/airflow/dags/ytdlp_ops_v02_orchestrator_auth.py index 65432ea..ac7e352 100644 --- a/airflow/dags/ytdlp_ops_v02_orchestrator_auth.py +++ b/airflow/dags/ytdlp_ops_v02_orchestrator_auth.py @@ -72,10 +72,10 @@ DEFAULT_REQUEST_PARAMS_JSON = """{ # Default settings DEFAULT_REDIS_CONN_ID = 'redis_default' -DEFAULT_TOTAL_WORKERS = 3 +DEFAULT_TOTAL_WORKERS = 8 DEFAULT_WORKERS_PER_BUNCH = 1 -DEFAULT_WORKER_DELAY_S = 5 -DEFAULT_BUNCH_DELAY_S = 20 +DEFAULT_WORKER_DELAY_S = 1 +DEFAULT_BUNCH_DELAY_S = 1 DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1") DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080) @@ -283,7 +283,7 @@ with DAG( # --- Worker Passthrough Parameters --- 'on_bannable_failure': Param( - 'stop_loop_on_auth_proceed_on_download_error', + 'proceed_loop_under_manual_inspection', type="string", enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'retry_and_ban_account_only', 'retry_on_connection_error', 'proceed_loop_under_manual_inspection', 'stop_loop_on_auth_proceed_on_download_error'], title="[Worker Param] On Bannable Failure Policy", @@ -294,37 +294,16 @@ with DAG( "'proceed_loop_under_manual_inspection': **BEWARE: MANUAL SUPERVISION REQUIRED.** Marks the URL as failed but continues the processing loop. Use this only when you can manually intervene by pausing the dispatcher DAG or creating a lock file (`/opt/airflow/inputfiles/AIRFLOW.PREVENT_URL_PULL.lockfile`) to prevent a runaway failure loop." "'stop_loop_on_auth_proceed_on_download_error': **(Default)** Stops the loop on an authentication/token error (like 'stop_loop'), but continues the loop on a download/probe error (like 'proceed...')." 
), - 'request_params_json': Param(DEFAULT_REQUEST_PARAMS_JSON, type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service.", render_kwargs={"rows": 20, "cols": 120}), + 'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."), 'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."), 'clients': Param( - 'mweb,web_camoufox,tv', + 'tv_simply', type="string", enum=[ - 'mweb,web_camoufox,tv', + 'tv_simply', 'mweb', - 'web_camoufox', 'tv', 'custom', - 'tv,web_safari,mweb,web_camoufox', - 'web_safari', - 'web', - 'web_embedded', - 'web_music', - 'web_creator', - 'web_safari_camoufox', - 'web_embedded_camoufox', - 'web_music_camoufox', - 'web_creator_camoufox', - 'mweb_camoufox', - 'android', - 'android_music', - 'android_creator', - 'android_vr', - 'ios', - 'ios_music', - 'ios_creator', - 'tv_simply', - 'tv_embedded', ], title="[Worker Param] Clients", description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_embedded. See DAG documentation for details." diff --git a/airflow/dags/ytdlp_ops_v02_orchestrator_dl.py b/airflow/dags/ytdlp_ops_v02_orchestrator_dl.py index df5e834..1c56efe 100644 --- a/airflow/dags/ytdlp_ops_v02_orchestrator_dl.py +++ b/airflow/dags/ytdlp_ops_v02_orchestrator_dl.py @@ -37,10 +37,10 @@ logger = logging.getLogger(__name__) # Default settings DEFAULT_REDIS_CONN_ID = 'redis_default' -DEFAULT_TOTAL_WORKERS = 3 +DEFAULT_TOTAL_WORKERS = 8 DEFAULT_WORKERS_PER_BUNCH = 1 -DEFAULT_WORKER_DELAY_S = 5 -DEFAULT_BUNCH_DELAY_S = 20 +DEFAULT_WORKER_DELAY_S = 1 +DEFAULT_BUNCH_DELAY_S = 1 # --- Helper Functions --- @@ -260,27 +260,24 @@ with DAG( type="string", enum=['best_audio', 'formats_0', 'formats_2', 'formats_3', 'custom'], title="[Worker Param] Download Format Preset", - description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformats_0: 18,140\nformats_2: 18,140,299/298/137/136/135/134/133\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318" + description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformats_0: 18,140\nformats_2: 18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318" ), 'download_format_custom': Param( - '18,140,299/298/137/136/135/134/133', + '18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy', type="string", title="[Worker Param] Custom Download Format", description="Custom yt-dlp format string. Used when preset is 'custom'. E.g., 'ba[ext=m4a]/bestaudio/best'." ), 'downloader': Param( - 'default', + 'cli', type="string", - enum=['default', 'aria2c'], - title="[Worker Param] Downloader", - description="Choose the downloader for yt-dlp." 
- ), - 'downloader_args_aria2c': Param( - 'aria2c:-x 4 -k 2M --max-download-limit=3M', - type="string", - title="[Worker Param] Aria2c Downloader Arguments", - description="Arguments to pass to yt-dlp's --downloader-args. Used when downloader is 'aria2c'." + enum=['py', 'aria-rpc', 'cli'], + title="[Worker Param] Download Tool", + description="Choose the download tool to use: 'py' (native python, recommended), 'aria-rpc' (send to aria2c daemon), 'cli' (legacy yt-dlp wrapper)." ), + 'aria_host': Param('172.17.0.1', type="string", title="[Worker Param] Aria2c Host", description="For 'aria-rpc' downloader: Host of the aria2c RPC server. Can be set via Airflow Variable 'YTDLP_ARIA_HOST'."), + 'aria_port': Param(6800, type="integer", title="[Worker Param] Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server. Can be set via Airflow Variable 'YTDLP_ARIA_PORT'."), + 'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="[Worker Param] Aria2c Secret", description="For 'aria-rpc' downloader: Secret token. Can be set via Airflow Variable 'YTDLP_ARIA_SECRET'."), 'yt_dlp_extra_args': Param( '--restrict-filenames', type=["string", "null"], diff --git a/airflow/dags/ytdlp_ops_v02_worker_per_url_auth.py b/airflow/dags/ytdlp_ops_v02_worker_per_url_auth.py index 1939821..82e6cc9 100644 --- a/airflow/dags/ytdlp_ops_v02_worker_per_url_auth.py +++ b/airflow/dags/ytdlp_ops_v02_worker_per_url_auth.py @@ -380,7 +380,10 @@ def get_url_and_assign_account(**context): @task def get_token(initial_data: dict, **context): - """Makes a single attempt to get a token from the Thrift service.""" + """Makes a single attempt to get a token by calling the ytops-client get-info tool.""" + import subprocess + import shlex + ti = context['task_instance'] params = context['params'] @@ -388,131 +391,89 @@ def get_token(initial_data: dict, **context): url = initial_data['url_to_process'] info_json_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') - host, port, timeout = params['service_ip'], int(params['service_port']), int(params.get('timeout', DEFAULT_TIMEOUT)) + host, port = params['service_ip'], int(params['service_port']) machine_id = params.get('machine_id') or socket.gethostname() clients = params.get('clients') request_params_json = params.get('request_params_json', '{}') assigned_proxy_url = params.get('assigned_proxy_url') - # Pretty-print the request parameters for debugging - try: - pretty_request_params = json.dumps(json.loads(request_params_json), indent=2) - logger.info(f"\n--- Request Parameters ---\n{pretty_request_params}\n--- End of Request Parameters ---") - except (json.JSONDecodeError, TypeError): - logger.warning("Could not parse request_params_json. 
Using raw content.") - logger.info(f"\n--- Raw Request Parameters ---\n{request_params_json}\n--- End of Raw Request Parameters ---") + video_id = _extract_video_id(url) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + job_dir_name = f"{timestamp}-{video_id or 'unknown'}" + job_dir_path = os.path.join(info_json_dir, job_dir_name) + os.makedirs(job_dir_path, exist_ok=True) + info_json_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json" + info_json_path = os.path.join(job_dir_path, info_json_filename) - # Construct Airflow log context to pass to the service - try: - from airflow.configuration import conf - remote_base = conf.get('logging', 'remote_base_log_folder') - log_path = ( - f"{remote_base}/dag_id={ti.dag_id}/run_id={ti.run_id}/" - f"task_id={ti.task_id}/attempt={ti.try_number}.log" - ) - airflow_log_context = AirflowLogContext( - logS3Path=log_path, - dagId=ti.dag_id, - runId=ti.run_id, - taskId=ti.task_id, - tryNumber=ti.try_number, - workerHostname=socket.gethostname(), - queue=ti.queue - ) - logger.info(f"Constructed Airflow log context for yt-ops service: {airflow_log_context}") - except Exception as e: - logger.warning(f"Could not construct full Airflow log context: {e}. Creating a basic one.") - airflow_log_context = AirflowLogContext( - dagId=ti.dag_id, - runId=ti.run_id, - taskId=ti.task_id, - tryNumber=ti.try_number, - workerHostname=socket.gethostname(), - queue=ti.queue - ) + cmd = [ + 'ytops-client', 'get-info', + '--host', host, + '--port', str(port), + '--profile', account_id, + '--output', info_json_path, + '--print-proxy', + '--verbose', + '--log-return', + ] + + if clients: + cmd.extend(['--client', clients]) + if machine_id: + cmd.extend(['--machine-id', machine_id]) + if request_params_json and request_params_json != '{}': + cmd.extend(['--request-params-json', request_params_json]) + if assigned_proxy_url: + cmd.extend(['--assigned-proxy-url', assigned_proxy_url]) + cmd.append(url) + logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' (Clients: {clients}) ---") - client, transport = None, None - try: - client, transport = _get_thrift_client(host, port, timeout) - token_data = client.getOrRefreshToken( - accountId=account_id, - updateType=TokenUpdateMode.AUTO, - url=url, - clients=clients, - machineId=machine_id, - airflowLogContext=airflow_log_context, - requestParamsJson=request_params_json, - assignedProxyUrl=assigned_proxy_url - ) + copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd) + logger.info(f"Executing command: {copy_paste_cmd}") - # Log a compact summary of the Thrift response, omitting large/detailed fields. - summary_token_data = copy(token_data) - if hasattr(summary_token_data, 'infoJson') and summary_token_data.infoJson: - summary_token_data.infoJson = f"... ({len(summary_token_data.infoJson)} bytes) ..." - if hasattr(summary_token_data, 'cookiesBlob') and summary_token_data.cookiesBlob: - summary_token_data.cookiesBlob = f"... ({len(summary_token_data.cookiesBlob)} bytes) ..." - # These will be logged separately below. - if hasattr(summary_token_data, 'requestSummary'): - summary_token_data.requestSummary = "..." - if hasattr(summary_token_data, 'communicationLogPaths'): - summary_token_data.communicationLogPaths = "..." 
- logger.info(f"Thrift service response summary: {summary_token_data}") + process = subprocess.run(cmd, capture_output=True, text=True, timeout=int(params.get('timeout', DEFAULT_TIMEOUT))) - request_summary = getattr(token_data, 'requestSummary', None) - if request_summary: - # Prepending a newline for better separation in logs. - logger.info(f"\n--- Request Summary ---\n{request_summary}") + if process.stdout: + logger.info(f"ytops-client STDOUT:\n{process.stdout}") + if process.stderr: + logger.info(f"ytops-client STDERR:\n{process.stderr}") - communication_log_paths = getattr(token_data, 'communicationLogPaths', None) - if communication_log_paths: - logger.info("--- Communication Log Paths ---") - for path in communication_log_paths: - logger.info(f" - {path}") - - info_json = getattr(token_data, 'infoJson', None) - if not (info_json and json.loads(info_json)): - raise AirflowException("Service returned success but info.json was empty or invalid.") - - video_id = _extract_video_id(url) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - - # Create a unique directory for this job's artifacts - job_dir_name = f"{timestamp}-{video_id or 'unknown'}" - job_dir_path = os.path.join(info_json_dir, job_dir_name) - os.makedirs(job_dir_path, exist_ok=True) - - info_json_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json" - info_json_path = os.path.join(job_dir_path, info_json_filename) - with open(info_json_path, 'w', encoding='utf-8') as f: - f.write(info_json) - - proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None) - return { - 'info_json_path': info_json_path, - 'socks_proxy': getattr(token_data, proxy_attr) if proxy_attr else None, - 'ytdlp_command': getattr(token_data, 'ytdlpCommand', None), - 'successful_account_id': account_id, - 'original_url': url, # Include original URL for fallback - 'clients': clients, # Pass clients string for accurate stats - } - except (PBServiceException, PBUserException, TTransportException) as e: - error_context = getattr(e, 'context', None) - if isinstance(error_context, str): - try: error_context = json.loads(error_context.replace("'", "\"")) - except: pass + if process.returncode != 0: + error_message = "ytops-client failed. See logs for details." + for line in reversed(process.stderr.strip().split('\n')): + if 'ERROR' in line or 'Thrift error' in line or 'Connection to server failed' in line: + error_message = line.strip() + break + error_code = 'GET_INFO_CLIENT_FAIL' + if "BOT_DETECTED" in process.stderr: + error_code = "BOT_DETECTED" + elif "BOT_DETECTION_SIGN_IN_REQUIRED" in process.stderr: + error_code = "BOT_DETECTION_SIGN_IN_REQUIRED" + elif "Connection to server failed" in process.stderr: + error_code = "TRANSPORT_ERROR" + error_details = { - 'error_message': getattr(e, 'message', str(e)), - 'error_code': getattr(e, 'errorCode', 'TRANSPORT_ERROR'), - 'proxy_url': error_context.get('proxy_url') if isinstance(error_context, dict) else None + 'error_message': error_message, + 'error_code': error_code, + 'proxy_url': None } - logger.error(f"Thrift call failed for account '{account_id}'. 
Exception: {error_details['error_message']}") ti.xcom_push(key='error_details', value=error_details) - raise AirflowException(f"Thrift call failed: {error_details['error_message']}") - finally: - if transport and transport.isOpen(): - transport.close() + raise AirflowException(f"ytops-client get-info failed: {error_message}") + + proxy = None + proxy_match = re.search(r"Proxy used: (.*)", process.stderr) + if proxy_match: + proxy = proxy_match.group(1).strip() + + return { + 'info_json_path': info_json_path, + 'socks_proxy': proxy, + 'ytdlp_command': None, + 'successful_account_id': account_id, + 'original_url': url, + 'clients': clients, + } @task.branch def handle_bannable_error_branch(task_id_to_check: str, **context): @@ -1135,7 +1096,7 @@ with DAG( 'prepend_client_to_account': Param(True, type="boolean", title="[Worker Param] Prepend Client to Account", description="If True, prepends client and timestamp to account names in prefix mode."), 'machine_id': Param(None, type=["string", "null"]), 'assigned_proxy_url': Param(None, type=["string", "null"], title="[Worker Param] Assigned Proxy URL", description="If provided, forces the token service to use this specific proxy for the request."), - 'clients': Param('mweb', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"), + 'clients': Param('tv_simply', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"), 'timeout': Param(DEFAULT_TIMEOUT, type="integer"), 'on_bannable_failure': Param('stop_loop_on_auth_proceed_on_download_error', type="string", enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'retry_and_ban_account_only', 'retry_on_connection_error', 'proceed_loop_under_manual_inspection', 'stop_loop_on_auth_proceed_on_download_error']), 'request_params_json': Param(json.dumps(DEFAULT_REQUEST_PARAMS), type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."), diff --git a/airflow/dags/ytdlp_ops_v02_worker_per_url_dl.py b/airflow/dags/ytdlp_ops_v02_worker_per_url_dl.py index 68605bf..5fb0bb8 100644 --- a/airflow/dags/ytdlp_ops_v02_worker_per_url_dl.py +++ b/airflow/dags/ytdlp_ops_v02_worker_per_url_dl.py @@ -300,7 +300,7 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context elif format_preset == 'formats_0': download_format = '18,140' elif format_preset == 'formats_2': - download_format = '18,140,299/298/137/136/135/134/133' + download_format = '18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy' elif format_preset == 'formats_3': download_format = '18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318' else: @@ -311,112 +311,102 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context retry_on_probe_failure = params.get('retry_on_probe_failure', False) def run_yt_dlp_command(format_selector: str): - """Constructs and runs a yt-dlp command, returning a list of final filenames.""" - cmd = [ - 'yt-dlp', '--verbose', '--print-traffic', '--load-info-json', info_json_path, - '-f', format_selector, '-o', full_output_path, - '--print', 'filename', '--continue', '--no-progress', '--no-simulate', - '--no-write-info-json', '--ignore-errors', '--no-playlist', - ] - - if params.get('fragment_retries'): - cmd.extend(['--fragment-retries', str(params['fragment_retries'])]) - if params.get('limit_rate'): - 
cmd.extend(['--limit-rate', params['limit_rate']]) - if params.get('socket_timeout'): - cmd.extend(['--socket-timeout', str(params['socket_timeout'])]) - if params.get('min_sleep_interval'): - cmd.extend(['--min-sleep-interval', str(params['min_sleep_interval'])]) - if params.get('max_sleep_interval'): - cmd.extend(['--max-sleep-interval', str(params['max_sleep_interval'])]) - if params.get('yt_dlp_test_mode'): - cmd.append('--test') + """Constructs and runs a yt-ops-client download command, returning a list of final filenames.""" + downloader = params.get('downloader', 'py') + cmd = ['ytops-client', 'download', downloader, '--load-info-json', info_json_path, '-f', format_selector] - downloader = params.get('downloader', 'default') - if proxy and not (downloader == 'aria2c' and proxy.startswith('socks5://')): + if proxy: cmd.extend(['--proxy', proxy]) - gost_process = None - try: - if downloader == 'aria2c': - cmd.extend(['--downloader', 'aria2c']) - downloader_args = params.get('downloader_args_aria2c') - if proxy and proxy.startswith('socks5://'): - import socket - from contextlib import closing - def find_free_port(): - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: - s.bind(('', 0)) - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - return s.getsockname()[1] - local_port = find_free_port() - http_proxy = f"http://127.0.0.1:{local_port}" - logger.info(f"Starting gost for format '{format_selector}' to forward {proxy} to {http_proxy}") - gost_cmd = ['gost', '-L', f'http://127.0.0.1:{local_port}', '-F', proxy] - gost_process = subprocess.Popen(gost_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - time.sleep(1) - if gost_process.poll() is not None: - stdout, stderr = gost_process.communicate() - logger.error(f"gost failed to start. Exit: {gost_process.returncode}. Stdout: {stdout.decode()}. 
Stderr: {stderr.decode()}") - raise AirflowException("gost proxy tunnel failed to start.") - user_args = downloader_args[len('aria2c:'):] if downloader_args and downloader_args.startswith('aria2c:') else (downloader_args or "") - final_args_str = f'aria2c:{user_args.strip()} --http-proxy={http_proxy}' - cmd.extend(['--downloader-args', final_args_str]) - elif downloader_args: - cmd.extend(['--downloader-args', downloader_args]) + if downloader == 'py': + cmd.extend(['--output-dir', download_dir]) + # The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args + py_extra_args = [] + if params.get('fragment_retries'): + py_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])]) + if params.get('limit_rate'): + py_extra_args.extend(['--limit-rate', params['limit_rate']]) + if params.get('socket_timeout'): + py_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])]) + if params.get('min_sleep_interval'): + py_extra_args.extend(['--sleep-interval', str(params['min_sleep_interval'])]) + if params.get('max_sleep_interval'): + py_extra_args.extend(['--max-sleep-interval', str(params['max_sleep_interval'])]) + if params.get('yt_dlp_test_mode'): + py_extra_args.append('--test') - extra_args = params.get('yt_dlp_extra_args') - if extra_args: - cmd.extend(shlex.split(extra_args)) - if original_url: - cmd.append(original_url) + existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '') + final_extra_args = existing_extra + py_extra_args + if final_extra_args: + cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)]) - copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd) - logger.info(f"Executing yt-dlp command for format '{format_selector}': {copy_paste_cmd}") - process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600) - - if process.stdout: - logger.info(f"yt-dlp STDOUT for format '{format_selector}':\n{process.stdout}") - if process.stderr: - # yt-dlp often prints progress and informational messages to stderr - logger.info(f"yt-dlp STDERR for format '{format_selector}':\n{process.stderr}") + elif downloader == 'aria-rpc': + cmd.extend([ + '--aria-host', params.get('aria_host', '172.17.0.1'), + '--aria-port', str(params.get('aria_port', 6800)), + '--aria-secret', params.get('aria_secret'), + '--wait', '--auto-merge-fragments', + '--fragments-dir', download_dir, + '--output-dir', download_dir, + ]) + if params.get('yt_dlp_cleanup_mode'): + cmd.append('--cleanup') - if process.returncode != 0: - logger.error(f"yt-dlp failed for format '{format_selector}' with exit code {process.returncode}") - # STDOUT and STDERR are already logged above. - raise AirflowException(f"yt-dlp command failed for format '{format_selector}'.") + elif downloader == 'cli': + cmd.extend(['--output-dir', download_dir]) + # The 'cli' tool is the old yt-dlp wrapper, so it takes similar arguments. 
+ cli_extra_args = [] + if params.get('fragment_retries'): + cli_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])]) + if params.get('limit_rate'): + cli_extra_args.extend(['--limit-rate', params['limit_rate']]) + if params.get('socket_timeout'): + cli_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])]) + if params.get('min_sleep_interval'): + cli_extra_args.extend(['--sleep-interval', str(params['min_sleep_interval'])]) + if params.get('max_sleep_interval'): + cli_extra_args.extend(['--max-sleep-interval', str(params['max_sleep_interval'])]) + if params.get('yt_dlp_test_mode'): + cli_extra_args.append('--test') - # In test mode, files are not created, so we only check that yt-dlp returned filenames. - # Otherwise, we verify that the files actually exist on disk. - output_files = [f for f in process.stdout.strip().split('\n') if f] - if not params.get('yt_dlp_test_mode'): - output_files = [f for f in output_files if os.path.exists(f)] + existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '') + final_extra_args = existing_extra + cli_extra_args + if final_extra_args: + cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)]) - if not output_files: - log_msg = (f"Test run for format '{format_selector}' did not produce any filenames." - if params.get('yt_dlp_test_mode') else - f"Download for format '{format_selector}' finished but no output files exist.") - exc_msg = (f"Test run for format '{format_selector}' did not produce any filenames." - if params.get('yt_dlp_test_mode') else - f"Download for format '{format_selector}' did not produce a file.") - - logger.error(log_msg) - logger.error(f"Full STDOUT:\n{process.stdout}") - logger.error(f"Full STDERR:\n{process.stderr}") - raise AirflowException(exc_msg) - - log_prefix = "SUCCESS (Test Mode):" if params.get('yt_dlp_test_mode') else "SUCCESS:" - logger.info(f"{log_prefix} Command for format '{format_selector}' complete. Files: {output_files}") - return output_files - finally: - if gost_process: - logger.info(f"Terminating gost process (PID: {gost_process.pid}) for format '{format_selector}'.") - gost_process.terminate() - try: - gost_process.wait(timeout=5) - except subprocess.TimeoutExpired: - gost_process.kill() - gost_process.wait() + copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd) + logger.info(f"Executing download command for format '{format_selector}': {copy_paste_cmd}") + process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600) + + if process.stdout: + logger.info(f"Download tool STDOUT for format '{format_selector}':\n{process.stdout}") + if process.stderr: + logger.info(f"Download tool STDERR for format '{format_selector}':\n{process.stderr}") + + if process.returncode != 0: + logger.error(f"Download tool failed for format '{format_selector}' with exit code {process.returncode}") + raise AirflowException(f"Download command failed for format '{format_selector}'. 
See logs for details.") + + output_files = [] + for line in process.stdout.strip().split('\n'): + # For aria-rpc, parse "Download and merge successful: " or "Download successful: " + match = re.search(r'successful: (.+)', line) + if match: + filepath = match.group(1).strip() + if os.path.exists(filepath): + output_files.append(filepath) + else: + logger.warning(f"File path from aria-rpc output does not exist locally: '{filepath}'") + # For py/cli, it's just the path + elif os.path.exists(line.strip()): + output_files.append(line.strip()) + + if not params.get('yt_dlp_test_mode') and not output_files: + raise AirflowException(f"Download for format '{format_selector}' finished but no output files were found or exist.") + + log_prefix = "SUCCESS (Test Mode):" if params.get('yt_dlp_test_mode') else "SUCCESS:" + logger.info(f"{log_prefix} Command for format '{format_selector}' complete. Files: {output_files}") + return output_files def run_ffmpeg_probe(filename): """Probes a file with ffmpeg to check for corruption.""" @@ -824,7 +814,7 @@ with DAG( type="string", enum=['best_audio', 'formats_0', 'formats_2', 'formats_3', 'custom'], title="Download Format Preset", - description="Select a predefined format string or choose 'custom'. To download multiple formats, this should be a comma-separated list of format IDs (e.g., '137,140').\nformats_0: 18,140\nformats_2: 18,140,299/298/137/136/135/134/133\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318" + description="Select a predefined format string or choose 'custom'. To download multiple formats, this should be a comma-separated list of format IDs (e.g., '137,140').\nformats_0: 18,140\nformats_2: 18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318" ), 'download_format_custom': Param( 'ba[ext=m4a]/bestaudio/best', @@ -833,18 +823,15 @@ with DAG( description="Custom yt-dlp format string. Used when preset is 'custom'. To download multiple formats, provide a comma-separated list of format IDs (e.g., '137,140')." ), 'downloader': Param( - 'default', + 'cli', type="string", - enum=['default', 'aria2c'], - title="Downloader", - description="Choose the downloader for yt-dlp." - ), - 'downloader_args_aria2c': Param( - 'aria2c:-x 4 -k 2M --max-download-limit=3M', - type="string", - title="Aria2c Downloader Arguments", - description="Arguments to pass to yt-dlp's --downloader-args. Used when downloader is 'aria2c'." + enum=['py', 'aria-rpc', 'cli'], + title="Download Tool", + description="Choose the download tool to use: 'py' (native python, recommended), 'aria-rpc' (send to aria2c daemon), 'cli' (legacy yt-dlp wrapper)." 
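For reference, the stdout parsing in the task above accepts both the aria-rpc success messages ("Download and merge successful: ..." / "Download successful: ...") and the bare paths printed by the 'py'/'cli' tools. A small sketch of that regex on made-up output lines:

import re

sample_stdout = [
    'Download and merge successful: /data/Some Title [dQw4w9WgXcQ].f137.mp4',
    'Download successful: /data/Some Title [dQw4w9WgXcQ].f140.m4a',
]
for line in sample_stdout:
    match = re.search(r'successful: (.+)', line)
    if match:
        print(match.group(1).strip())
# /data/Some Title [dQw4w9WgXcQ].f137.mp4
# /data/Some Title [dQw4w9WgXcQ].f140.m4a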
), + 'aria_host': Param('172.17.0.1', type="string", title="Aria2c Host", description="For 'aria-rpc' downloader: Host of the aria2c RPC server."), + 'aria_port': Param(6800, type="integer", title="Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server."), + 'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="Aria2c Secret", description="For 'aria-rpc' downloader: Secret token."), 'yt_dlp_extra_args': Param( '--no-part --restrict-filenames', type=["string", "null"], diff --git a/ansible/playbook-master.yml b/ansible/playbook-master.yml index aff9806..594c782 100644 --- a/ansible/playbook-master.yml +++ b/ansible/playbook-master.yml @@ -5,6 +5,9 @@ vars_files: - "{{ inventory_dir }}/group_vars/all/generated_vars.yml" - "{{ inventory_dir }}/group_vars/all/vault.yml" + vars: + envoy_port: 9980 + envoy_admin_port: 9981 pre_tasks: - name: Announce master deployment debug: diff --git a/ytops_client/__init__.py b/ytops_client/__init__.py new file mode 100644 index 0000000..f66cb47 --- /dev/null +++ b/ytops_client/__init__.py @@ -0,0 +1 @@ +# This file makes 'ytops_client' a Python package. diff --git a/ytops_client/cli.py b/ytops_client/cli.py new file mode 100644 index 0000000..1e67fb0 --- /dev/null +++ b/ytops_client/cli.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +import sys +import argparse + +# Import the functions that define and execute the logic for each subcommand +from .list_formats_tool import add_list_formats_parser, main_list_formats +from .get_info_tool import add_get_info_parser, main_get_info +from .download_tool import add_download_parser, main_download +from .stress_policy_tool import add_stress_policy_parser, main_stress_policy +from .stress_formats_tool import add_stress_formats_parser, main_stress_formats +from .cookie_tool import add_cookie_tool_parser, main_cookie_tool +from .download_aria_tool import add_download_aria_parser, main_download_aria +from .download_native_py_tool import add_download_native_py_parser, main_download_native_py + +def main(): + """ + Main entry point for the yt-ops-client CLI. + Parses arguments and dispatches to the appropriate subcommand function. + """ + # Workaround for argparse behavior with positional arguments that start with a hyphen. + # If the command is 'get-info' and the last argument looks like a video ID + # starting with a '-', we insert '--' before it to tell argparse to treat it + # as a positional argument, not an option. This assumes the URL is the last argument. + if len(sys.argv) >= 3 and sys.argv[1] == 'get-info': + last_arg = sys.argv[-1] + # A YouTube video ID is 11 characters. + if last_arg.startswith('-') and len(last_arg) == 11: + import re + if re.fullmatch(r'-[a-zA-Z0-9_-]{10}', last_arg): + sys.argv.insert(len(sys.argv) - 1, '--') + + parser = argparse.ArgumentParser( + description="YT Ops Client Tools", + formatter_class=argparse.RawTextHelpFormatter + ) + subparsers = parser.add_subparsers(dest='command', help='Available sub-commands') + + # Add subparsers from each tool module + add_list_formats_parser(subparsers) + add_get_info_parser(subparsers) + + # Create a top-level 'download' command with its own subcommands + download_parser = subparsers.add_parser( + 'download', + help='Download using different methods.', + description='Provides access to various download tools. Use "download --help" for details.' 
+ ) + download_subparsers = download_parser.add_subparsers(dest='download_command', help='Available downloaders', required=True) + add_download_parser(download_subparsers) # Adds 'cli' subcommand + add_download_native_py_parser(download_subparsers) # Adds 'py' subcommand + add_download_aria_parser(download_subparsers) # Adds 'aria-rpc' subcommand + + add_stress_policy_parser(subparsers) + add_stress_formats_parser(subparsers) + add_cookie_tool_parser(subparsers) + + args = parser.parse_args() + + # If no command is provided, print help and exit. + if not args.command: + parser.print_help() + return 1 + + # Dispatch to the correct main function based on the command + if args.command == 'list-formats': + return main_list_formats(args) + elif args.command == 'get-info': + return main_get_info(args) + elif args.command == 'download': + if args.download_command == 'cli': + return main_download(args) + elif args.download_command == 'py': + return main_download_native_py(args) + elif args.download_command == 'aria-rpc': + return main_download_aria(args) + elif args.command == 'stress-policy': + return main_stress_policy(args) + elif args.command == 'stress-formats': + return main_stress_formats(args) + elif args.command == 'convert-cookies': + return main_cookie_tool(args) + + # This path should not be reachable if a command is required or handled above. + parser.print_help() + return 1 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ytops_client/cookie_tool.py b/ytops_client/cookie_tool.py new file mode 100644 index 0000000..ff9b5df --- /dev/null +++ b/ytops_client/cookie_tool.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +""" +Tool to convert JSON cookies to the standard Netscape txt format. +""" + +import argparse +import json +import sys +import logging + +# Configure logging +logger = logging.getLogger('cookie_tool') + +def convert_json_to_netscape(json_data): + """ + Converts a list of cookie dictionaries to a Netscape format string. + """ + netscape_cookies = [] + # The header is optional but good practice for some tools. + netscape_cookies.append("# Netscape HTTP Cookie File") + netscape_cookies.append("# http://www.netscape.com/newsref/std/cookie_spec.html") + netscape_cookies.append("# This is a generated file! Do not edit.") + netscape_cookies.append("") + + if not isinstance(json_data, list): + raise TypeError("Input JSON must be a list of cookie objects.") + + for cookie in json_data: + if not isinstance(cookie, dict): + logger.warning(f"Skipping non-dictionary item in JSON list: {cookie}") + continue + + domain = cookie.get('domain', '') + # The 'hostOnly' flag determines if the domain is accessible to subdomains. + # Netscape format's flag is TRUE if subdomains can access it. + # So, hostOnly=false means flag=TRUE. + # A leading dot in the domain also implies this for some implementations. + if domain.startswith('.'): + include_subdomains = 'TRUE' + else: + include_subdomains = 'FALSE' if cookie.get('hostOnly', True) else 'TRUE' + + path = cookie.get('path', '/') + secure = 'TRUE' if cookie.get('secure', False) else 'FALSE' + + # Expiration date. If session cookie or no expiration, use 0. 
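Each JSON cookie becomes one tab-separated Netscape line (domain, subdomain flag, path, secure, expires, name, value). A quick illustrative check, assuming the convert_json_to_netscape() being defined above and a sample cookie:

sample_cookie = [{
    "domain": ".example.com",
    "hostOnly": False,
    "path": "/",
    "secure": True,
    "expirationDate": 1672531199,
    "name": "my_cookie",
    "value": "my_value",
}]
print(convert_json_to_netscape(sample_cookie).splitlines()[-1])
# .example.com  TRUE  /  TRUE  1672531199  my_cookie  my_value   (fields are tab-separated)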
+ if cookie.get('session', False) or 'expirationDate' not in cookie or cookie['expirationDate'] is None: + expires = 0 + else: + expires = int(cookie['expirationDate']) + + name = cookie.get('name', '') + value = str(cookie.get('value', '')) + + # Skip cookies without essential fields + if not domain or not name: + logger.warning(f"Skipping cookie with missing domain or name: {cookie}") + continue + + netscape_cookies.append( + f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expires}\t{name}\t{value}" + ) + + return "\n".join(netscape_cookies) + +def add_cookie_tool_parser(subparsers): + """Add the parser for the 'convert-cookies' command.""" + parser = subparsers.add_parser( + 'convert-cookies', + description='Convert JSON cookies to Netscape format.', + formatter_class=argparse.RawTextHelpFormatter, + help='Convert JSON cookies to Netscape format.', + epilog=""" +Reads a JSON array of cookie objects from stdin and prints the +Netscape cookie file format to stdout. + +Example JSON input format (per cookie): +{ + "domain": ".example.com", + "hostOnly": false, + "path": "/", + "secure": true, + "expirationDate": 1672531199, + "name": "my_cookie", + "value": "my_value" +} + +Example usage: +cat cookies.json | yt-ops-client convert-cookies > cookies.txt +""" + ) + parser.add_argument( + 'input_file', + nargs='?', + type=argparse.FileType('r', encoding='utf-8'), + default=sys.stdin, + help="Path to the JSON cookie file. Reads from stdin if not provided." + ) + parser.add_argument( + '-o', '--output', + type=argparse.FileType('w', encoding='utf-8'), + default=sys.stdout, + help="Output file path for the Netscape cookies. Defaults to stdout." + ) + parser.add_argument('--verbose', action='store_true', help='Enable verbose logging.') + return parser + +def main_cookie_tool(args): + """Main logic for the 'convert-cookies' command.""" + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s', stream=sys.stderr) + else: + logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s', stream=sys.stderr) + + try: + json_content = args.input_file.read() + if not json_content.strip(): + logger.error("Input is empty.") + return 1 + + cookie_data = json.loads(json_content) + netscape_string = convert_json_to_netscape(cookie_data) + + args.output.write(netscape_string + '\n') + + if args.output is not sys.stdout: + logger.info(f"Successfully converted cookies to {args.output.name}") + + return 0 + except json.JSONDecodeError: + logger.error("Invalid JSON provided. Please check the input file.") + return 1 + except TypeError as e: + logger.error(f"Error processing JSON: {e}") + return 1 + except Exception as e: + logger.error(f"An unexpected error occurred: {e}", exc_info=args.verbose) + return 1 diff --git a/ytops_client/download_aria_tool.py b/ytops_client/download_aria_tool.py new file mode 100644 index 0000000..a962408 --- /dev/null +++ b/ytops_client/download_aria_tool.py @@ -0,0 +1,687 @@ +#!/usr/bin/env python3 +""" +Tool to send a download to an aria2c daemon via RPC. +""" + +import argparse +import json +import logging +import sys +import os +import glob +import shutil +import re +import shlex +import time +from urllib.parse import urljoin + +try: + import aria2p + from aria2p.utils import human_readable_bytes +except ImportError: + print("aria2p is not installed. 
Please install it with: pip install aria2p", file=sys.stderr) + sys.exit(1) + +logger = logging.getLogger('download_aria_tool') + +class TimeoutError(Exception): + pass + + +def add_download_aria_parser(subparsers): + """Add the parser for the 'download aria-rpc' command.""" + parser = subparsers.add_parser( + 'aria-rpc', + description='Send a download to an aria2c daemon via RPC, using an info.json from stdin or a file.', + formatter_class=argparse.RawTextHelpFormatter, + help='Download a specific format using aria2c RPC.', + epilog=""" +Usage Notes for Fragmented Downloads (e.g., DASH): + +To download and automatically merge fragmented formats, you must: +1. Use '--wait' to make the operation synchronous. +2. Use '--auto-merge-fragments' to enable the merge logic. +3. Ensure this script has access to the directory where aria2c saves files. + +Example for a remote aria2c daemon: + - The remote daemon saves files to '/srv/downloads' on its machine. + - This directory is mounted locally at '/mnt/remote_aria2_downloads'. + +cat latest-info.json | yt-ops-client download aria-rpc -f "299/137" \\ + --wait --auto-merge-fragments \\ + --remote-dir /srv/downloads \\ + --fragments-dir /mnt/remote_aria2_downloads +""" + ) + parser.add_argument('--load-info-json', type=argparse.FileType('r', encoding='utf-8'), help="Path to the info.json file. If not provided, reads from stdin.") + parser.add_argument('-f', '--format', required=True, help='The format ID to download. Supports yt-dlp style format selectors (e.g., "137/136,140").') + parser.add_argument('--output-dir', help='Local directory to save the final merged file. Defaults to the current directory.') + parser.add_argument('--fragments-dir', help='The local path where this script should look for downloaded fragments. If the aria2c daemon is remote, this should be a local mount point corresponding to --remote-dir. Defaults to --output-dir.') + parser.add_argument('--remote-dir', help='The absolute path to the download directory on the remote aria2c host. This is passed via RPC.') + parser.add_argument('--aria-host', default='localhost', help='The host of the aria2c RPC server. Default: localhost.') + parser.add_argument('--aria-port', type=int, default=6800, help='The port of the aria2c RPC server. Default: 6800.') + parser.add_argument('--aria-secret', help='The secret token for the aria2c RPC server (often required, e.g., "SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX").') + parser.add_argument('--proxy', help='Proxy to use for the download, e.g., "socks5://127.0.0.1:1080".') + parser.add_argument('--downloader-args', help='Arguments for aria2c, in yt-dlp format (e.g., "aria2c:[-x 8, -k 1M]").') + parser.add_argument('--wait', action='store_true', help='Wait for the download to complete and report its status. Note: This makes the operation synchronous and will block until the download finishes.') + parser.add_argument('--wait-timeout', help='Timeout in seconds for waiting on downloads. Use "auto" to calculate based on a minimum speed of 200KiB/s. Requires --wait. Default: no timeout.') + parser.add_argument('--auto-merge-fragments', action='store_true', help='Automatically merge fragments after download. Requires --wait and assumes the script has filesystem access to the aria2c host.') + parser.add_argument('--remove-fragments-after-merge', action='store_true', help='Delete individual fragment files after a successful merge. 
Requires --auto-merge-fragments.') + parser.add_argument('--cleanup', action='store_true', help='After a successful download, remove the final file(s) from the filesystem. For fragmented downloads, this implies --remove-fragments-after-merge.') + parser.add_argument('--remove-on-complete', action=argparse.BooleanOptionalAction, default=True, help='Remove the download from aria2c history on successful completion. Use --no-remove-on-complete to disable. May fail on older aria2c daemons.') + parser.add_argument('--purge-on-complete', action='store_true', help='Use aria2.purgeDownloadResult to clear ALL completed/failed downloads from history on success. Use as a workaround for older daemons.') + parser.add_argument('--verbose', action='store_true', help='Enable verbose output for this script.') + return parser + +def cleanup_aria_download(api, downloads): + """Pause and remove downloads from aria2c.""" + if not downloads: + return + try: + logger.info(f"Attempting to clean up {len(downloads)} download(s) from aria2c...") + # Filter out downloads that might already be gone + valid_downloads = [d for d in downloads if hasattr(d, 'gid')] + if not valid_downloads: + logger.info("No valid downloads to clean up.") + return + api.pause(valid_downloads) + # Give aria2c a moment to process the pause command before removing + time.sleep(0.5) + api.remove(valid_downloads) + logger.info("Cleanup successful.") + except Exception as e: + logger.warning(f"An error occurred during aria2c cleanup: {e}") + + +def parse_aria_error(download): + """Parses an aria2p Download object to get a detailed error message.""" + error_code = download.error_code + error_message = download.error_message + + if not error_message: + return f"Unknown aria2c error (Code: {error_code})" + + # Check for common HTTP errors in the message + http_status_match = re.search(r'HTTP status (\d+)', error_message) + if http_status_match: + status_code = int(http_status_match.group(1)) + if status_code == 403: + return f"HTTP Error 403: Forbidden. The URL may have expired or requires valid cookies/headers." + elif status_code == 404: + return f"HTTP Error 404: Not Found. The resource is unavailable." + else: + return f"HTTP Error {status_code}." + + if "Timeout" in error_message or "timed out" in error_message.lower(): + return "Download timed out." + + # Fallback to the raw error message + return f"Aria2c error (Code: {error_code}): {error_message}" + + +def parse_aria_args_to_options(args_str): + """ + Parses yt-dlp style downloader args for aria2c. + Example: "aria2c:[-x 8, -k 1M]" or just "-x 8 -k 1M" + Returns a dictionary of options for aria2p. + """ + if not args_str or not args_str.strip(): + return {} + + inner_args_str = args_str.strip() + match = re.match(r'aria2c:\s*\[(.*)\]', inner_args_str) + if match: + # Handle yt-dlp's format + inner_args_str = match.group(1).replace(',', ' ') + else: + # If it doesn't match, assume the whole string is a set of arguments. + logger.debug(f"Downloader args '{args_str}' does not match 'aria2c:[...]' format. 
Parsing as a raw argument string.") + + arg_list = shlex.split(inner_args_str) + + # Use a mini-parser to handle CLI-style args + parser = argparse.ArgumentParser(add_help=False, prog="aria2c_args_parser") + parser.add_argument('-x', '--max-connection-per-server') + parser.add_argument('-k', '--min-split-size') + parser.add_argument('-s', '--split') + parser.add_argument('--all-proxy') + + try: + # We only care about known arguments + known_args, unknown_args = parser.parse_known_args(arg_list) + if unknown_args: + logger.warning(f"Ignoring unknown arguments in --downloader-args: {unknown_args}") + # Convert to dict, removing None values + return {k: v for k, v in vars(known_args).items() if v is not None} + except Exception: + logger.warning(f"Failed to parse arguments inside --downloader-args: '{inner_args_str}'") + return {} + + +def main_download_aria(args): + """Main logic for the 'download-aria' command.""" + log_level = logging.DEBUG if args.verbose else logging.INFO + logging.basicConfig(level=log_level, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', stream=sys.stderr) + + if args.remove_fragments_after_merge and not args.auto_merge_fragments: + logger.error("--remove-fragments-after-merge requires --auto-merge-fragments.") + return 1 + if args.auto_merge_fragments and not args.wait: + logger.error("--auto-merge-fragments requires --wait.") + return 1 + if args.wait_timeout and not args.wait: + logger.error("--wait-timeout requires --wait.") + return 1 + + if args.wait: + logger.info("Will wait for download to complete and report status. This is a synchronous operation.") + else: + logger.info("Will submit download and exit immediately (asynchronous).") + + info_json_content = "" + input_source_name = "" + if args.load_info_json: + info_json_content = args.load_info_json.read() + input_source_name = args.load_info_json.name + else: + info_json_content = sys.stdin.read() + input_source_name = "stdin" + + if not info_json_content.strip(): + logger.error(f"Failed to read info.json from {input_source_name}. Input is empty.") + return 1 + + try: + info_data = json.loads(info_json_content) + logger.info(f"Successfully loaded info.json from {input_source_name}.") + except json.JSONDecodeError: + logger.error(f"Failed to parse info.json from {input_source_name}. Is the input valid JSON?") + return 1 + + # Find the requested format, supporting yt-dlp style selectors + target_format = None + # A format selector can be a comma-separated list of preferences, + # where each preference can be a slash-separated list of format_ids. + # e.g., "299/137/136,140" means try 299, then 137, then 136, then 140. 
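The comprehension that follows flattens the selector into an ordered list of format IDs to try. A standalone sketch using the example selector from the comment above:

selector = "299/137/136,140"
format_preferences = [item.strip()
                      for sublist in (part.split('/') for part in selector.split(','))
                      for item in sublist if item.strip()]
print(format_preferences)  # ['299', '137', '136', '140']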
+ format_preferences = [item.strip() for sublist in (i.split('/') for i in args.format.split(',')) for item in sublist if item.strip()] + + available_formats_map = {f['format_id']: f for f in info_data.get('formats', []) if 'format_id' in f} + + for format_id in format_preferences: + if format_id in available_formats_map: + target_format = available_formats_map[format_id] + logger.info(f"Selected format ID '{format_id}' from selector '{args.format}'.") + break + + if not target_format: + logger.error(f"No suitable format found for selector '{args.format}' in info.json.") + return 1 + + # Get file size for auto-timeout and dynamic options + total_filesize = target_format.get('filesize') or target_format.get('filesize_approx') + + # Construct filename + video_id = info_data.get('id', 'unknown_video_id') + title = info_data.get('title', 'unknown_title') + ext = target_format.get('ext', 'mp4') + # Sanitize title for filename + safe_title = "".join([c for c in title if c.isalpha() or c.isdigit() or c in (' ', '-', '_')]).rstrip() + filename = f"{safe_title} [{video_id}].f{target_format['format_id']}.{ext}" + + # Prepare options for aria2 + aria_options = { + # Options from yt-dlp's aria2c integration for performance and reliability + 'max-connection-per-server': 16, + 'split': 16, + 'min-split-size': '1M', + 'http-accept-gzip': 'true', + 'file-allocation': 'none', + } + + if args.proxy: + aria_options['all-proxy'] = args.proxy + + custom_options = parse_aria_args_to_options(args.downloader_args) + + # Dynamically set min-split-size if not overridden by user + if 'min_split_size' not in custom_options and total_filesize: + if total_filesize > 100 * 1024 * 1024: # 100 MiB + aria_options['min-split-size'] = '5M' + logger.info("File is > 100MiB, dynamically setting min-split-size to 5M.") + + if custom_options: + aria_options.update(custom_options) + logger.info(f"Applied custom aria2c options from --downloader-args: {custom_options}") + + aria_options['out'] = filename + + # Add headers from info.json, mimicking yt-dlp's behavior for aria2c + headers = target_format.get('http_headers') + if headers: + header_list = [f'{key}: {value}' for key, value in headers.items()] + aria_options['header'] = header_list + logger.info(f"Adding {len(header_list)} HTTP headers to the download.") + if args.verbose: + for h in header_list: + if h.lower().startswith('cookie:'): + logger.debug(f" Header: Cookie: [REDACTED]") + else: + logger.debug(f" Header: {h}") + + is_fragmented = 'fragments' in target_format + if not is_fragmented: + url = target_format.get('url') + if not url: + logger.error(f"Format ID '{args.format}' has neither a URL nor fragments.") + return 1 + + try: + logger.info(f"Connecting to aria2c RPC at http://{args.aria_host}:{args.aria_port}") + client = aria2p.Client( + host=f"http://{args.aria_host}", + port=args.aria_port, + secret=args.aria_secret or "" + ) + api = aria2p.API(client) + + timeout_seconds = None + if args.wait_timeout: + if args.wait_timeout.lower() == 'auto': + if total_filesize: + # Min speed: 200 KiB/s. Min timeout: 30s. + min_speed = 200 * 1024 + calculated_timeout = int(total_filesize / min_speed) + timeout_seconds = max(30, calculated_timeout) + total_filesize_hr, _ = human_readable_bytes(total_filesize) + logger.info(f"Auto-calculated timeout: {timeout_seconds}s (based on {total_filesize_hr} at 200KiB/s).") + else: + logger.warning("Cannot use 'auto' timeout: file size not available in info.json. 
Timeout disabled.") + else: + try: + timeout_seconds = int(args.wait_timeout) + if timeout_seconds <= 0: + raise ValueError + except ValueError: + logger.error(f"Invalid --wait-timeout value: '{args.wait_timeout}'. Must be a positive integer or 'auto'.") + return 1 + + if is_fragmented: + return download_fragments_aria(args, api, target_format, filename, aria_options, timeout_seconds, remote_dir=args.remote_dir) + else: + return download_url_aria(args, api, url, filename, aria_options, timeout_seconds, remote_dir=args.remote_dir) + + except Exception as e: + logger.error(f"An error occurred while communicating with aria2c: {e}", exc_info=args.verbose) + return 1 + +def download_url_aria(args, api, url, filename, aria_options, timeout_seconds, remote_dir=None): + """Handle downloading a single URL with aria2c.""" + if remote_dir: + aria_options['dir'] = remote_dir + logger.info(f"Adding download for format '{args.format}' with URL: {url[:70]}...") + downloads = api.add_uris([url], options=aria_options) + + if not downloads: + logger.error("Failed to add download to aria2c. The API returned an empty result.") + return 1 + + # Handle older aria2p versions that return a single Download object instead of a list + download = downloads[0] if isinstance(downloads, list) else downloads + logger.info(f"Successfully added download to aria2c. GID: {download.gid}") + + if args.wait: + logger.info(f"Waiting for download {download.gid} to complete...") + start_time = time.time() + try: + while True: + if timeout_seconds and (time.time() - start_time > timeout_seconds): + raise TimeoutError(f"Download did not complete within {timeout_seconds}s timeout.") + + # Re-fetch the download object to get the latest status + download.update() + # A download is no longer active if it's complete, errored, paused, or removed. + if download.status not in ('active', 'waiting'): + break + + progress_info = ( + f"\rGID {download.gid}: {download.status} " + f"{download.progress_string()} " + f"({download.download_speed_string()}) " + f"ETA: {download.eta_string()}" + ) + sys.stdout.write(progress_info) + sys.stdout.flush() + time.sleep(0.5) + except (KeyboardInterrupt, TimeoutError) as e: + sys.stdout.write('\n') + if isinstance(e, KeyboardInterrupt): + logger.warning("Wait interrupted by user. Cleaning up download...") + cleanup_aria_download(api, [download]) + return 130 + else: # TimeoutError + logger.error(f"Download timed out. Cleaning up... Error: {e}") + cleanup_aria_download(api, [download]) + return 1 + except aria2p.ClientException as e: + # This can happen if the download completes and is removed by aria2c + # before we can check its final status. Assume success in this case. + logger.warning(f"Could not get final status for GID {download.gid} (maybe removed on completion?): {e}. Assuming success.") + print(f"Download for GID {download.gid} presumed successful.") + return 0 + + sys.stdout.write('\n') # Newline after progress bar + + # Final status check (no need to update again, we have the latest status) + if download.status == 'complete': + logger.info(f"Download {download.gid} completed successfully.") + + downloaded_filepath_remote = None + if download.files: + downloaded_filepath_remote = download.files[0].path + print(f"Download successful: {downloaded_filepath_remote}") + else: + print("Download successful, but no file path reported by aria2c.") + + if args.cleanup and downloaded_filepath_remote: + local_filepath = None + # To map remote path to local, we need remote_dir and a local equivalent. 
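--cleanup has to translate the path reported by the (possibly remote) aria2c daemon into a local path. A sketch of the mapping performed below, using the mount layout from the command's epilog (the file path itself is made up):

import os

remote_dir = '/srv/downloads'                    # --remote-dir on the aria2c host
local_base_dir = '/mnt/remote_aria2_downloads'   # --fragments-dir (local mount of it)
downloaded_filepath_remote = '/srv/downloads/Some Title [dQw4w9WgXcQ].f137.mp4'

relative_path = os.path.relpath(downloaded_filepath_remote, remote_dir)
local_filepath = os.path.join(local_base_dir, relative_path)
print(local_filepath)
# /mnt/remote_aria2_downloads/Some Title [dQw4w9WgXcQ].f137.mp4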
+ # We'll use fragments_dir as the local equivalent, which defaults to output_dir. + local_base_dir = args.fragments_dir or args.output_dir or '.' + if remote_dir: + if downloaded_filepath_remote.startswith(remote_dir): + relative_path = os.path.relpath(downloaded_filepath_remote, remote_dir) + local_filepath = os.path.join(local_base_dir, relative_path) + else: + logger.warning(f"Cleanup: Downloaded file path '{downloaded_filepath_remote}' does not start with remote-dir '{remote_dir}'. Cannot map to local path.") + else: + logger.warning(f"Cleanup: --remote-dir not specified. Assuming download path is accessible locally as '{downloaded_filepath_remote}'.") + local_filepath = downloaded_filepath_remote + + if local_filepath: + try: + if os.path.exists(local_filepath): + os.remove(local_filepath) + logger.info(f"Cleanup: Removed downloaded file '{local_filepath}'") + else: + logger.warning(f"Cleanup: File not found at expected local path '{local_filepath}'. Skipping removal.") + except OSError as e: + logger.error(f"Cleanup failed: Could not remove file '{local_filepath}': {e}") + elif args.cleanup: + logger.warning("Cleanup requested, but no downloaded file path was reported by aria2c.") + + if args.purge_on_complete: + try: + api.purge_download_result() + logger.info("Purged all completed/failed downloads from aria2c history.") + except Exception as e: + logger.warning(f"Failed to purge download history: {e}") + elif args.remove_on_complete: + try: + api.remove_download_result(download) + logger.info(f"Removed download {download.gid} from aria2c history.") + except Exception as e: + logger.warning(f"Failed to remove download {download.gid} from history: {e}") + + return 0 + else: + detailed_error = parse_aria_error(download) + logger.error(f"Download {download.gid} failed. Error: {detailed_error}") + return 1 + else: + print(f"Successfully added download. GID: {download.gid}") + return 0 + +def download_fragments_aria(args, api, target_format, filename, aria_options, timeout_seconds, remote_dir=None): + """Handle downloading fragmented formats with aria2c.""" + logger.info(f"Format '{args.format}' is fragmented. Adding all fragments to download queue.") + fragment_base_url = target_format.get('fragment_base_url') + fragments = target_format['fragments'] + + MAX_FRAGMENTS = 50000 + if len(fragments) > MAX_FRAGMENTS: + logger.error( + f"The number of fragments ({len(fragments)}) exceeds the safety limit of {MAX_FRAGMENTS}. " + f"This is to prevent overwhelming the aria2c server. Aborting." + ) + return 1 + + # We need to set the 'dir' option for all fragments if specified. + # The 'out' option will be set per-fragment. + frag_aria_options = aria_options.copy() + frag_aria_options.pop('out', None) # Remove the main 'out' option + + if remote_dir: + frag_aria_options['dir'] = remote_dir + logger.info(f"Instructing remote aria2c to save fragments to: {remote_dir}") + + base_filename, file_ext = os.path.splitext(filename) + + calls = [] + for i, fragment in enumerate(fragments): + frag_url = fragment.get('url') + if not frag_url: + if not fragment_base_url: + logger.error(f"Fragment {i} has no URL and no fragment_base_url is available. 
Aborting.") + return 1 + frag_url = urljoin(fragment_base_url, fragment['path']) + + # Use the base filename from the main file, but add fragment identifier + fragment_filename = f"{base_filename}-Frag{i}{file_ext}" + + current_frag_options = frag_aria_options.copy() + current_frag_options['out'] = os.path.basename(fragment_filename) + + # Prepare parameters for multicall in the format: + # {"methodName": "aria2.addUri", "params": [["url"], {"out": "file.mp4"}]} + # The secret token is automatically added by aria2p. + params = [[frag_url], current_frag_options] + call_struct = { + "methodName": api.client.ADD_URI, + "params": params + } + calls.append(call_struct) + + results = api.client.multicall(calls) + if not results: + logger.error("Failed to add fragments to aria2c. The API returned an empty result.") + return 1 + + # The result of a multicall of addUri is a list of lists, where each inner list + # contains the GID of one download, e.g., [['gid1'], ['gid2']]. + # A failed call for a fragment may result in a fault struct dict instead of a list. + # We extract GIDs from successful calls. + gids = [result[0] for result in results if isinstance(result, list) and result] + + if len(gids) != len(fragments): + failed_count = len(fragments) - len(gids) + logger.warning(f"{failed_count} out of {len(fragments)} fragments failed to be added to aria2c.") + + if not gids: + logger.error("Failed to add any fragments to aria2c. All submissions failed.") + return 1 + + logger.info(f"Successfully added {len(gids)} fragments to aria2c.") + if args.verbose: + logger.debug(f"GIDs: {gids}") + + if args.wait: + logger.info(f"Waiting for {len(gids)} fragments to complete...") + start_time = time.time() + downloads_to_cleanup = [] + try: + while True: + if timeout_seconds and (time.time() - start_time > timeout_seconds): + raise TimeoutError(f"Fragment downloads did not complete within {timeout_seconds}s timeout.") + + downloads = api.get_downloads(gids) + downloads_to_cleanup = downloads # Store for potential cleanup + # A download is considered "active" if it's currently downloading or waiting in the queue. + # It is "not active" if it is complete, errored, paused, or removed. + active_downloads = [d for d in downloads if d.status in ('active', 'waiting')] + if not active_downloads: + break # All downloads are complete or have stopped for other reasons + + for d in active_downloads: + d.update() + + completed_count = len(downloads) - len(active_downloads) + total_bytes = sum(d.total_length for d in downloads) + downloaded_bytes = sum(d.completed_length for d in downloads) + total_speed = sum(d.download_speed for d in downloads) + progress_percent = (downloaded_bytes / total_bytes * 100) if total_bytes > 0 else 0 + + progress_info = ( + f"\rProgress: {completed_count}/{len(downloads)} fragments | " + f"{progress_percent:.1f}% " + f"({human_readable_bytes(downloaded_bytes)}/{human_readable_bytes(total_bytes)}) " + f"Speed: {human_readable_bytes(total_speed)}/s" + ) + sys.stdout.write(progress_info) + sys.stdout.flush() + time.sleep(0.5) + except (KeyboardInterrupt, TimeoutError) as e: + sys.stdout.write('\n') + if isinstance(e, KeyboardInterrupt): + logger.warning("Wait interrupted by user. Cleaning up fragments...") + cleanup_aria_download(api, downloads_to_cleanup) + return 130 + else: # TimeoutError + logger.error(f"Download timed out. Cleaning up fragments... 
Error: {e}") + cleanup_aria_download(api, downloads_to_cleanup) + return 1 + except aria2p.ClientException as e: + # This can happen if downloads complete and are removed by aria2c + # before we can check their final status. Assume success in this case. + logger.warning(f"Could not get final status for some fragments (maybe removed on completion?): {e}. Assuming success.") + + sys.stdout.write('\n') + + # Final status check + failed_downloads = [] + try: + downloads = api.get_downloads(gids) + failed_downloads = [d for d in downloads if d.status != 'complete'] + except aria2p.ClientException as e: + logger.warning(f"Could not perform final status check for fragments (maybe removed on completion?): {e}. Assuming success.") + # If we can't check, we assume success based on the earlier wait loop not failing catastrophically. + failed_downloads = [] + + if failed_downloads: + logger.error(f"{len(failed_downloads)} fragments failed to download.") + for d in failed_downloads: + detailed_error = parse_aria_error(d) + logger.error(f" GID {d.gid}: {detailed_error}") + return 1 + else: + logger.info("All fragments downloaded successfully.") + output_dir = args.output_dir or '.' + final_filepath = os.path.join(output_dir, filename) + fragments_lookup_dir = args.fragments_dir or output_dir + + if args.auto_merge_fragments: + logger.info(f"Attempting to merge fragments into: {final_filepath}") + logger.info(f"Searching for fragments in local directory: {os.path.abspath(fragments_lookup_dir)}") + try: + # base_filename and file_ext are available from earlier in the function + # We must escape the base filename in case it contains glob special characters like [ or ]. + escaped_base = glob.escape(base_filename) + search_path = os.path.join(fragments_lookup_dir, f"{escaped_base}-Frag*{file_ext}") + fragment_files = glob.glob(search_path) + + if not fragment_files: + logger.error(f"No fragment files found with pattern: {search_path}") + return 1 + + def fragment_sort_key(f): + match = re.search(r'Frag(\d+)', os.path.basename(f)) + return int(match.group(1)) if match else -1 + fragment_files.sort(key=fragment_sort_key) + + with open(final_filepath, 'wb') as dest_file: + for frag_path in fragment_files: + with open(frag_path, 'rb') as src_file: + shutil.copyfileobj(src_file, dest_file) + + logger.info(f"Successfully merged {len(fragment_files)} fragments into {final_filepath}") + + if args.remove_fragments_after_merge or args.cleanup: + logger.info("Removing fragment files...") + for frag_path in fragment_files: + os.remove(frag_path) + logger.info("Fragment files removed.") + + if args.cleanup: + try: + os.remove(final_filepath) + logger.info(f"Cleanup: Removed merged file '{final_filepath}'") + except OSError as e: + logger.error(f"Cleanup failed: Could not remove merged file '{final_filepath}': {e}") + + print(f"Download and merge successful: {final_filepath}") + + if args.purge_on_complete: + try: + api.purge_download_result() + logger.info("Purged all completed/failed downloads from aria2c history.") + except Exception as e: + logger.warning(f"Failed to purge download history: {e}") + elif args.remove_on_complete: + try: + # The `downloads` variable from the last status check should be valid here. 
+ api.remove_download_result(downloads) + logger.info(f"Removed {len(downloads)} fragment downloads from aria2c history.") + except aria2p.ClientException as e: + logger.warning(f"Could not remove fragment downloads from history (maybe already gone?): {e}") + except Exception as e: + logger.warning(f"Failed to remove fragment downloads from history: {e}") + + return 0 + + except Exception as e: + logger.error(f"An error occurred during merging: {e}", exc_info=args.verbose) + logger.error("Fragments were downloaded but not merged.") + return 1 + else: + print("Download successful. Fragments now need to be merged manually.") + print(f"The final merged file should be named: {final_filepath}") + print("You can merge them with a command like:") + print(f" cat `ls -v '{os.path.join(fragments_lookup_dir, base_filename)}'-Frag*'{file_ext}'` > '{final_filepath}'") + + if args.cleanup: + logger.info("Cleanup requested. Removing downloaded fragments...") + try: + # base_filename and file_ext are available from earlier in the function + escaped_base = glob.escape(base_filename) + search_path = os.path.join(fragments_lookup_dir, f"{escaped_base}-Frag*{file_ext}") + fragment_files = glob.glob(search_path) + + if not fragment_files: + logger.warning(f"Cleanup: No fragment files found with pattern: {search_path}") + else: + for frag_path in fragment_files: + os.remove(frag_path) + logger.info(f"Removed {len(fragment_files)} fragment files.") + except Exception as e: + logger.error(f"An error occurred during fragment cleanup: {e}", exc_info=args.verbose) + + if args.purge_on_complete: + try: + api.purge_download_result() + logger.info("Purged all completed/failed downloads from aria2c history.") + except Exception as e: + logger.warning(f"Failed to purge download history: {e}") + elif args.remove_on_complete: + try: + # The `downloads` variable from the last status check should be valid here. + api.remove_download_result(downloads) + logger.info(f"Removed {len(downloads)} fragment downloads from aria2c history.") + except aria2p.ClientException as e: + logger.warning(f"Could not remove fragment downloads from history (maybe already gone?): {e}") + except Exception as e: + logger.warning(f"Failed to remove fragment downloads from history: {e}") + + return 0 + else: + print(f"Successfully added {len(gids)} fragments. GIDs: {gids}") + print("These fragments will need to be merged manually after download.") + return 0 diff --git a/ytops_client/download_native_py_tool.py b/ytops_client/download_native_py_tool.py new file mode 100644 index 0000000..0cb18ea --- /dev/null +++ b/ytops_client/download_native_py_tool.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 +""" +Tool to download a specified format using yt-dlp as a Python library. +""" + +import argparse +import contextlib +import io +import json +import logging +import os +import re +import shlex +import sys +import time +from datetime import datetime + +try: + import yt_dlp +except ImportError: + print("yt-dlp is not installed. Please install it with: pip install yt-dlp", file=sys.stderr) + sys.exit(1) + +logger = logging.getLogger('download_native_py_tool') + +# A custom logger for yt-dlp to capture output and key events +class YTDLPLogger: + def __init__(self): + self.final_filename = None + self.is_403 = False + self.is_timeout = False + + def debug(self, msg): + # yt-dlp logs the destination file path at the debug level. 
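The debug handler below recovers the final output path from two yt-dlp message shapes. A standalone sketch of the same string handling on sample messages (not real yt-dlp output):

import re

msg = '[download] Destination: ./out/Some Title [dQw4w9WgXcQ].f137.mp4'
print(msg.split(':', 1)[1].strip())
# ./out/Some Title [dQw4w9WgXcQ].f137.mp4

msg = '[download] ./out/Some Title [dQw4w9WgXcQ].f140.m4a has already been downloaded'
match = re.search(r'\[download\]\s+(.*)\s+has already been downloaded', msg)
print(match.group(1).strip())
# ./out/Some Title [dQw4w9WgXcQ].f140.m4a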
+ if msg.startswith('[download] Destination:'): + self.final_filename = msg.split(':', 1)[1].strip() + elif msg.startswith('[download]') and 'has already been downloaded' in msg: + match = re.search(r'\[download\]\s+(.*)\s+has already been downloaded', msg) + if match: + self.final_filename = match.group(1).strip() + logger.debug(msg) + + def info(self, msg): + logger.info(msg) + + def warning(self, msg): + logger.warning(msg) + + def error(self, msg): + if "HTTP Error 403" in msg: + self.is_403 = True + if "Read timed out" in msg: + self.is_timeout = True + logger.error(msg) + +def ytdlp_progress_hook(d, ytdlp_logger): + """Progress hook to capture the final filename.""" + if d['status'] == 'finished': + ytdlp_logger.final_filename = d.get('filename') + logger.info(f"Download finished. Final file: {ytdlp_logger.final_filename}") + +def add_download_native_py_parser(subparsers): + """Add the parser for the 'download py' command.""" + parser = subparsers.add_parser( + 'py', + description='Download using yt-dlp as a Python library (recommended). This method calls yt-dlp functions directly.', + formatter_class=argparse.RawTextHelpFormatter, + help='Download using a direct Python call to yt-dlp (recommended).' + ) + parser.add_argument('--load-info-json', type=argparse.FileType('r', encoding='utf-8'), help="Path to the info.json file. If not provided, reads from stdin.") + parser.add_argument('-f', '--format', required=True, help='The format selection string to download (e.g., "18", "299/137", "bestvideo+bestaudio").') + parser.add_argument('--output-dir', default='.', help='Directory to save the downloaded file. Defaults to current directory.') + parser.add_argument('--save-info-json-dir', help='If specified, save the info.json received from stdin to this directory with an auto-generated name.') + parser.add_argument('--proxy', help='Proxy to use for the download, e.g., "socks5://127.0.0.1:1080".') + parser.add_argument('--proxy-rename', help='Apply sed-style regex substitution to the proxy URL. Format: s/pattern/replacement/') + parser.add_argument('--temp-path', help='Directory for temporary files (e.g., fragments). Use a RAM disk for best performance.') + parser.add_argument('--pause', type=int, default=0, help='Seconds to wait before starting the download.') + parser.add_argument('--download-continue', action='store_true', help='Enable download continuation (--no-overwrites and --continue flags for yt-dlp).') + parser.add_argument('--verbose', action='store_true', help='Enable verbose output for this script and yt-dlp.') + parser.add_argument('--cli-config', help='Path to a yt-dlp configuration file to load.') + parser.add_argument('--downloader', help='Name of the external downloader backend for yt-dlp to use (e.g., "aria2c", "native").') + parser.add_argument('--downloader-args', help='Arguments to pass to the external downloader backend (e.g., "aria2c:-x 8").') + parser.add_argument('--extra-ytdlp-args', help='A string of extra command-line arguments to pass to yt-dlp.') + parser.add_argument('--output-buffer', action='store_true', help='Download to an in-memory buffer and print raw bytes to stdout. Final filename is printed to stderr.') + parser.add_argument('--cleanup', action='store_true', help='After download, rename the file to include a timestamp and truncate it to 0 bytes.') + parser.add_argument('--merge-output-format', help='Container format to merge to (e.g., "mp4", "mkv"). 
Overrides config file.') + return parser + +def main_download_native_py(args): + """Main logic for the 'download-native-py' command.""" + # If outputting to buffer, all logging must go to stderr to keep stdout clean for binary data. + log_stream = sys.stderr if args.output_buffer else sys.stdout + log_level = logging.DEBUG if args.verbose else logging.INFO + # Reconfigure root logger + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) + logging.basicConfig(level=log_level, stream=log_stream, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + + if args.pause > 0: + logger.info(f"Pausing for {args.pause} seconds...") + time.sleep(args.pause) + + info_json_content = "" + input_source_name = "" + if args.load_info_json: + info_json_content = args.load_info_json.read() + input_source_name = args.load_info_json.name + else: + info_json_content = sys.stdin.read() + input_source_name = "stdin" + + if not info_json_content.strip(): + logger.error(f"Failed to read info.json from {input_source_name}. Input is empty.") + return 1 + + try: + info_data = json.loads(info_json_content) + logger.info(f"Successfully loaded info.json from {input_source_name}.") + except json.JSONDecodeError: + logger.error(f"Failed to parse info.json from {input_source_name}. Is the input valid JSON?") + return 1 + + if args.save_info_json_dir: + try: + video_id = info_data.get('id', 'unknown_video_id') + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + filename = f"{timestamp}-{video_id}-info.json" + output_path = os.path.join(args.save_info_json_dir, filename) + os.makedirs(args.save_info_json_dir, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(info_data, f, indent=2) + logger.info(f"Saved info.json to {output_path}") + except Exception as e: + logger.error(f"Failed to save info.json: {e}") + + # Handle proxy and proxy rename + proxy_url = args.proxy + if not proxy_url: + proxy_url = info_data.get('_proxy_url') + if proxy_url: + logger.info(f"Using proxy from info.json: {proxy_url}") + + if proxy_url and args.proxy_rename: + rename_rule = args.proxy_rename.strip("'\"") + if rename_rule.startswith('s/') and rename_rule.count('/') >= 2: + try: + parts = rename_rule.split('/') + pattern, replacement = parts[1], parts[2] + original_proxy = proxy_url + proxy_url = re.sub(pattern, replacement, proxy_url) + logger.info(f"Renamed proxy URL from '{original_proxy}' to '{proxy_url}' using rule '{rename_rule}'") + except re.error as e: + logger.error(f"Invalid regex in --proxy-rename: {e}") + return 1 + else: + logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/") + return 1 + + # Build the yt-dlp options dictionary + # Start by parsing options from config file and extra args to establish a baseline. + base_opts_args = [] + if args.cli_config and os.path.exists(args.cli_config): + try: + with open(args.cli_config, 'r', encoding='utf-8') as f: + config_content = f.read() + base_opts_args.extend(shlex.split(config_content)) + logger.info(f"Loaded {len(base_opts_args)} arguments from config file: {args.cli_config}") + except Exception as e: + logger.error(f"Failed to read or parse config file {args.cli_config}: {e}") + return 1 + elif args.cli_config: + logger.warning(f"Config file '{args.cli_config}' not found. 
Ignoring.") + + if args.extra_ytdlp_args: + extra_args_list = shlex.split(args.extra_ytdlp_args) + logger.info(f"Adding {len(extra_args_list)} extra arguments from --extra-ytdlp-args.") + base_opts_args.extend(extra_args_list) + + ydl_opts = {} + if base_opts_args: + try: + # This is an internal API, but it's the most accurate way to parse CLI args + # into the ydl_opts dictionary format. + ydl_opts, _, _ = yt_dlp.parse_options(base_opts_args) + except Exception as e: + logger.error(f"Failed to parse options from config/extra_args: {e}") + return 1 + + # Now, layer the script's explicit arguments on top, as they have higher precedence. + os.makedirs(args.output_dir, exist_ok=True) + output_template = os.path.join(args.output_dir, '%(title)s [%(id)s].f%(format_id)s.%(ext)s') + + ytdlp_logger = YTDLPLogger() + + # Use update to merge, so explicit args overwrite config/extra args. + ydl_opts.update({ + 'format': args.format, + 'outtmpl': '-' if args.output_buffer else output_template, + 'logger': ytdlp_logger, + 'progress_hooks': [lambda d: ytdlp_progress_hook(d, ytdlp_logger)], + 'verbose': args.verbose, + }) + + if args.temp_path: + ydl_opts['paths'] = {'temp': args.temp_path} + logger.info(f"Using temporary path: {args.temp_path}") + + if args.download_continue: + ydl_opts['continuedl'] = True + ydl_opts['nooverwrites'] = True + + if proxy_url: + ydl_opts['proxy'] = proxy_url + + if args.downloader: + ydl_opts['downloader'] = {args.downloader: None} + if args.downloader_args: + # yt-dlp expects a dict for downloader_args + # e.g., {'aria2c': ['-x', '8']} + try: + downloader_name, args_str = args.downloader_args.split(':', 1) + ydl_opts.setdefault('downloader_args', {})[downloader_name] = shlex.split(args_str) + except ValueError: + logger.error(f"Invalid --downloader-args format. Expected 'downloader:args'. Got: '{args.downloader_args}'") + return 1 + + if args.merge_output_format: + ydl_opts['merge_output_format'] = args.merge_output_format + + try: + logger.info(f"Starting download for format '{args.format}' using yt-dlp library...") + + download_buffer = None + if args.output_buffer: + # When downloading to buffer, we redirect stdout to capture the binary data. + download_buffer = io.BytesIO() + ctx_mgr = contextlib.redirect_stdout(download_buffer) + else: + # Otherwise, use a null context manager. + ctx_mgr = contextlib.nullcontext() + + with ctx_mgr, yt_dlp.YoutubeDL(ydl_opts) as ydl: + # The download() method is for URLs. For a pre-fetched info dict, + # we must use process_ie_result to bypass the info extraction step. + # It raises DownloadError on failure, which is caught by the outer try...except block. + ydl.process_ie_result(info_data) + # If process_ie_result completes without an exception, the download was successful. + retcode = 0 + + # The success path is now always taken if no exception was raised. + if retcode == 0: + logger.info("yt-dlp download completed successfully.") + + if args.output_buffer: + # Write the captured binary data to the actual stdout. + sys.stdout.buffer.write(download_buffer.getvalue()) + sys.stdout.buffer.flush() + # Print the filename to stderr for the orchestrator. + if ytdlp_logger.final_filename: + print(ytdlp_logger.final_filename, file=sys.stderr) + else: + # Print the filename to stdout as usual. 
+ if ytdlp_logger.final_filename: + print(ytdlp_logger.final_filename, file=sys.stdout) + + if args.cleanup: + downloaded_filepath = ytdlp_logger.final_filename + if downloaded_filepath and os.path.exists(downloaded_filepath): + try: + logger.info(f"Cleanup: Renaming and truncating '{downloaded_filepath}'") + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + directory, original_filename = os.path.split(downloaded_filepath) + filename_base, filename_ext = os.path.splitext(original_filename) + new_filename = f"{filename_base}_{timestamp}{filename_ext}.empty" + new_filepath = os.path.join(directory, new_filename) + os.rename(downloaded_filepath, new_filepath) + logger.info(f"Renamed to '{new_filepath}'") + with open(new_filepath, 'w') as f: + pass + logger.info(f"Truncated '{new_filepath}' to 0 bytes.") + except Exception as e: + logger.error(f"Cleanup failed: {e}") + return 1 # Treat cleanup failure as a script failure + elif not args.output_buffer: + logger.warning("Cleanup requested, but no downloaded file was found. Skipping cleanup.") + return 0 + else: + logger.error(f"yt-dlp download failed with internal exit code {retcode}.") + return 1 + + except yt_dlp.utils.DownloadError as e: + # This catches download-specific errors from yt-dlp + logger.error(f"yt-dlp DownloadError: {e}") + return 1 + except Exception as e: + logger.exception(f"An unexpected error occurred during yt-dlp execution: {e}") + return 1 diff --git a/ytops_client/download_tool.py b/ytops_client/download_tool.py new file mode 100644 index 0000000..b61aa04 --- /dev/null +++ b/ytops_client/download_tool.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python3 +""" +Tool to download a specified format using an info.json from stdin. +""" + +import argparse +import json +import logging +import os +import re +import shlex +import subprocess +import sys +import tempfile +import time +from datetime import datetime + +# Configure logging +logger = logging.getLogger('download_tool') + +def add_download_parser(subparsers): + """Add the parser for the 'download cli' command.""" + parser = subparsers.add_parser( + 'cli', + description='Download using the legacy yt-dlp CLI wrapper. This method invokes yt-dlp as a subprocess.', + formatter_class=argparse.RawTextHelpFormatter, + help='Download using the legacy yt-dlp CLI wrapper.' + ) + parser.add_argument('--load-info-json', type=argparse.FileType('r', encoding='utf-8'), help="Path to the info.json file. If not provided, reads from stdin.") + parser.add_argument('-f', '--format', required=True, help='The format selection string to download (e.g., "18", "299/137", "bestvideo+bestaudio").') + parser.add_argument('--output-dir', default='.', help='Directory to save the downloaded file. Defaults to current directory.') + parser.add_argument('--save-info-json-dir', help='If specified, save the info.json received from stdin to this directory with an auto-generated name.') + parser.add_argument('--proxy', help='Proxy to use for the download, e.g., "socks5://127.0.0.1:1080". This option sets the proxy, overriding any value from the info.json.') + parser.add_argument('--proxy-rename', help='Apply sed-style regex substitution to the proxy URL. 
Format: s/pattern/replacement/') + parser.add_argument('--pause', type=int, default=0, help='Seconds to wait before starting the download.') + parser.add_argument('--print-traffic', action='store_true', help='Print traffic instead of a progress bar.') + parser.add_argument('--download-continue', action='store_true', help='Enable download continuation (--continue and --part flags for yt-dlp).') + parser.add_argument('--verbose', action='store_true', help='Enable verbose output for this script and yt-dlp.') + parser.add_argument('--cli-config', default='cli.config', help='Path to a yt-dlp configuration file. Defaults to "cli.config".') + parser.add_argument('--cleanup', action='store_true', help='After download, rename the file to include a timestamp and truncate it to 0 bytes.') + parser.add_argument('--log-file', help='Append full yt-dlp output to the specified log file.') + parser.add_argument('--yt-dlp-path', default='yt-dlp', help='Path to the yt-dlp executable. Defaults to "yt-dlp" in PATH.') + parser.add_argument('--extra-ytdlp-args', help='A string of extra command-line arguments to pass to yt-dlp.') + parser.add_argument('--downloader', help='Name of the external downloader to use (e.g., "aria2c", "native").') + parser.add_argument('--downloader-args', help='Arguments to pass to the external downloader (e.g., "aria2c:-x 8").') + parser.add_argument('--merge-output-format', help='Container format to merge to (e.g., "mp4", "mkv"). Overrides config file.') + return parser + +def main_download(args): + """Main logic for the 'download' command.""" + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + if args.pause > 0: + logger.info(f"Pausing for {args.pause} seconds...") + time.sleep(args.pause) + + info_json_content = "" + input_source_name = "" + if args.load_info_json: + info_json_content = args.load_info_json.read() + input_source_name = args.load_info_json.name + else: + info_json_content = sys.stdin.read() + input_source_name = "stdin" + + if not info_json_content.strip(): + logger.error(f"Failed to read info.json from {input_source_name}. Input is empty.") + return 1 + + try: + info_data = json.loads(info_json_content) + logger.info(f"Successfully loaded info.json from {input_source_name}.") + except json.JSONDecodeError: + logger.error(f"Failed to parse info.json from {input_source_name}. Is the input valid JSON?") + return 1 + + if args.save_info_json_dir: + try: + video_id = info_data.get('id', 'unknown_video_id') + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + filename = f"{timestamp}-{video_id}-info.json" + output_path = os.path.join(args.save_info_json_dir, filename) + os.makedirs(args.save_info_json_dir, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(info_data, f, indent=2) + logger.info(f"Saved info.json to {output_path}") + except Exception as e: + logger.error(f"Failed to save info.json: {e}") + + # Determine proxy to use + proxy_url = args.proxy + if not proxy_url: + proxy_url = info_data.get('_proxy_url') + if proxy_url: + logger.info(f"Using proxy from info.json: {proxy_url}") + + if proxy_url and args.proxy_rename: + rename_rule = args.proxy_rename + # The user's command line might include quotes that are preserved by shlex. + # Strip them to get the raw rule. 
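The sed-style --proxy-rename rule is applied below by splitting on '/' and running re.sub. A standalone sketch with a hypothetical proxy URL and rule:

import re

proxy_url = 'socks5://gost-proxy:1080'     # hypothetical proxy taken from info.json
rename_rule = 's/gost-proxy/127.0.0.1/'    # hypothetical --proxy-rename value
parts = rename_rule.split('/')
pattern, replacement = parts[1], parts[2]
print(re.sub(pattern, replacement, proxy_url))
# socks5://127.0.0.1:1080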
+ rename_rule = rename_rule.strip("'\"") + if rename_rule.startswith('s/') and rename_rule.count('/') >= 2: + try: + parts = rename_rule.split('/') + pattern = parts[1] + replacement = parts[2] + original_proxy = proxy_url + proxy_url = re.sub(pattern, replacement, proxy_url) + logger.info(f"Renamed proxy URL from '{original_proxy}' to '{proxy_url}' using rule '{rename_rule}'") + except re.error as e: + logger.error(f"Invalid regex in --proxy-rename: {e}") + return 1 + except IndexError: + logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/") + return 1 + else: + logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/") + return 1 + + # yt-dlp needs to load the info.json from a file + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', encoding='utf-8') as tmp: + json.dump(info_data, tmp) + info_json_path = tmp.name + + logger.debug(f"Temporarily saved info.json to {info_json_path}") + + downloaded_filepath = None + return_code = 1 # Default to error + + try: + # Create output directory if it doesn't exist + os.makedirs(args.output_dir, exist_ok=True) + output_template = os.path.join(args.output_dir, '%(title)s [%(id)s].f%(format_id)s.%(ext)s') + + cmd = [ + args.yt_dlp_path, + '--load-info-json', info_json_path, + '-f', args.format, + '-o', output_template, + '--print', 'filename', + ] + + if args.extra_ytdlp_args: + cmd.extend(shlex.split(args.extra_ytdlp_args)) + + if args.downloader: + cmd.extend(['--downloader', args.downloader]) + if args.downloader_args: + cmd.extend(['--downloader-args', args.downloader_args]) + if args.merge_output_format: + cmd.extend(['--merge-output-format', args.merge_output_format]) + + if args.download_continue: + cmd.extend(['--continue', '--part']) + + if os.path.exists(args.cli_config): + logger.info(f"Using config file: {args.cli_config}") + cmd.extend(['--config-location', args.cli_config]) + else: + logger.info(f"Config file '{args.cli_config}' not found. Using yt-dlp defaults.") + + if args.print_traffic: + cmd.append('--print-traffic') + cmd.append('--no-progress') + else: + cmd.append('--progress') + + if args.verbose: + cmd.append('--verbose') + + if proxy_url: + cmd.extend(['--proxy', proxy_url]) + + # Determine if we need to capture output. 
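+ # Capturing is needed so the filename printed by '--print filename' can be recovered for
+ # --cleanup and so output can be appended to --log-file; the trade-off is that yt-dlp's
+ # progress bar is buffered instead of streamed live to the terminal.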
+ capture_output = args.cleanup or args.log_file or args.print_traffic + + if capture_output and not args.print_traffic: + logger.info("Note: --cleanup or --log-file requires capturing output, which may affect progress bar display.") + + logger.info(f"Executing yt-dlp command for format '{args.format}'") + + # Construct a display version of the command for logging + display_cmd_str = ' '.join(f"'{arg}'" if ' ' in arg else arg for arg in cmd) + if os.path.exists(args.cli_config): + try: + with open(args.cli_config, 'r', encoding='utf-8') as f: + config_contents = ' '.join(f.read().split()) + if config_contents: + logger.info(f"cli.config contents: {config_contents}") + except IOError as e: + logger.warning(f"Could not read config file {args.cli_config}: {e}") + + logger.info(f"Full command: {display_cmd_str}") + + if capture_output: + process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding='utf-8') + + log_f = None + if args.log_file: + try: + log_f = open(args.log_file, 'a', encoding='utf-8') + log_f.write(f"\n--- Log entry: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ---\n") + log_f.write(f"Command: {' '.join(cmd)}\n\n") + except IOError as e: + logger.error(f"Failed to open log file {args.log_file}: {e}") + + stdout_data, stderr_data = process.communicate() + return_code = process.returncode + + # Write captured output to terminal and log file + if stdout_data: + sys.stdout.write(stdout_data) + sys.stdout.flush() + if log_f: + for line in stdout_data.splitlines(True): + log_f.write(f"[stdout] {line}") + + if stderr_data: + sys.stderr.write(stderr_data) + sys.stderr.flush() + if log_f: + for line in stderr_data.splitlines(True): + log_f.write(f"[stderr] {line}") + + stdout_lines = stdout_data.splitlines() if stdout_data else [] + + if log_f: + log_f.write(f"\n--- End log entry (yt-dlp exit code: {return_code}) ---\n") + log_f.close() + + for line in reversed(stdout_lines): + if line and os.path.exists(line): + downloaded_filepath = line + logger.info(f"Detected downloaded file: {downloaded_filepath}") + break + else: + # Original behavior: progress bar direct to terminal, no capture + process = subprocess.Popen(cmd) + process.wait() + return_code = process.returncode + + if return_code != 0: + logger.error(f"yt-dlp exited with error code {return_code}") + else: + logger.info("yt-dlp command completed successfully.") + + except Exception as e: + logger.exception(f"An unexpected error occurred: {e}") + return 1 + finally: + # Clean up the temporary file + if os.path.exists(info_json_path): + os.unlink(info_json_path) + logger.debug(f"Removed temporary file {info_json_path}") + + # Cleanup phase + if args.cleanup: + if downloaded_filepath and os.path.exists(downloaded_filepath): + try: + logger.info(f"Cleanup: Renaming and truncating '{downloaded_filepath}'") + + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + + directory, original_filename = os.path.split(downloaded_filepath) + filename_base, filename_ext = os.path.splitext(original_filename) + + # New name format is [base]_[timestamp][ext].empty + new_filename = f"{filename_base}_{timestamp}{filename_ext}.empty" + new_filepath = os.path.join(directory, new_filename) + + os.rename(downloaded_filepath, new_filepath) + logger.info(f"Renamed to '{new_filepath}'") + + with open(new_filepath, 'w') as f: + pass + logger.info(f"Truncated '{new_filepath}' to 0 bytes.") + + except Exception as e: + logger.error(f"Cleanup failed: {e}") + return 1 + else: + logger.warning("Cleanup requested, but no 
downloaded file was found. Skipping cleanup.") + + return return_code diff --git a/ytops_client/get_info_tool.py b/ytops_client/get_info_tool.py new file mode 100644 index 0000000..44195d4 --- /dev/null +++ b/ytops_client/get_info_tool.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 +""" +Tool to get info.json from the Thrift service. +""" + +import argparse +import json +import os +import re +import sys +import logging +import codecs +from datetime import datetime +from typing import Dict, Any, Optional + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +# Note: The CLI entrypoint will configure the root logger. +# We get our own logger here for namespacing. +logger = logging.getLogger('get_info_tool') + +# Import Thrift modules +# Add project's thrift gen_py path to allow importing 'pangramia' +script_dir = os.path.dirname(os.path.abspath(__file__)) +project_root = os.path.abspath(os.path.join(script_dir, '..')) +sys.path.insert(0, os.path.join(project_root, 'thrift_model', 'gen_py')) +from thrift.transport import TTransport +from pangramia.yt.common.ttypes import TokenUpdateMode +from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException +from yt_ops_services.client_utils import get_thrift_client +from ytops_client.request_params_help import REQUEST_PARAMS_HELP_STRING + + +def get_video_id(url: str) -> str: + """Extracts a YouTube video ID from a URL.""" + # For URLs like https://www.youtube.com/watch?v=VIDEO_ID + match = re.search(r"v=([0-9A-Za-z_-]{11})", url) + if match: + return match.group(1) + # For URLs like https://youtu.be/VIDEO_ID + match = re.search(r"youtu\.be\/([0-9A-Za-z_-]{11})", url) + if match: + return match.group(1) + # For plain video IDs + if re.fullmatch(r'[0-9A-Za-z_-]{11}', url): + return url + return "unknown_video_id" + + +def parse_key_value_params(params_str: str) -> Dict[str, Any]: + """Parses a comma-separated string of key=value pairs into a nested dict.""" + params = {} + if not params_str: + return params + for pair in params_str.split(','): + if '=' not in pair: + logger.warning(f"Skipping malformed parameter pair: {pair}") + continue + key, value_str = pair.split('=', 1) + keys = key.strip().split('.') + + # Try to parse value as JSON primitive, otherwise treat as string + try: + # Don't parse if it's quoted, treat as string + if (value_str.startswith('"') and value_str.endswith('"')) or \ + (value_str.startswith("'") and value_str.endswith("'")): + value = value_str[1:-1] + else: + value = json.loads(value_str) + except json.JSONDecodeError: + value = value_str + + d = params + for k in keys[:-1]: + if k not in d or not isinstance(d[k], dict): + d[k] = {} + d = d[k] + d[keys[-1]] = value + return params + + +def add_get_info_parser(subparsers): + """Add the parser for the 'get-info' command.""" + parser = subparsers.add_parser( + 'get-info', + description='Get info.json from Thrift service', + formatter_class=argparse.RawTextHelpFormatter, + help='Get info.json from the Thrift service.' + ) + parser.add_argument('url', help='YouTube URL or video ID') + parser.add_argument('--host', default='127.0.0.1', help="Thrift server host. 
Using 127.0.0.1 avoids harmless connection errors when the local Envoy proxy only listens on IPv4.") + parser.add_argument('--port', type=int, default=9080, help='Thrift server port') + parser.add_argument('--auth-host', help='Thrift server host (overrides --host).') + parser.add_argument('--auth-port', type=int, help='Thrift server port (overrides --port).') + parser.add_argument('--profile', default='default_profile', help='The profile name (accountId) to use for the request.') + parser.add_argument('--client', help='''Specific client to use. Overrides server default. +Available clients: + web, web_safari, web_embedded, web_music, web_creator, mweb + android, android_music, android_creator, android_vr + ios, ios_music, ios_creator + tv, tv_simply, tv_embedded + +Append "_camoufox" to any client name (e.g., "web_camoufox") to force +the browser-based generation strategy.''') + parser.add_argument('--output', help='Output file path for the info.json. If not provided, prints to stdout.') + parser.add_argument('--output-auto', action='store_true', help='Automatically generate output filename for info.json and invocation data. Format: DATETIME-CLIENT-VIDEOID-info.json') + parser.add_argument('--output-auto-url-only', action='store_true', help='Automatically generate output filename for info.json (format: VIDEOID-info.json) and also save a copy to latest-info.json.') + parser.add_argument('--output-auto-suffix', help='Suffix to add to the filename before "-info.json" when using --output-auto or --output-auto-url-only. E.g., "-cycle1".') + parser.add_argument('--log-file-auto', action='store_true', help='Automatically generate a log filename and save all script logs to it. Format: VIDEOID-DATETIME.log') + parser.add_argument('--machine-id', help='Identifier for the client machine. Defaults to hostname.') + parser.add_argument('--worker-id', help='Identifier for a worker process. Used for naming files with --save-latest.') + parser.add_argument('--save-latest', action='store_true', help='Save a copy of the info.json to latest-info.json or [worker-id]-latest-info.json. This is implied by --output-auto-url-only.') + parser.add_argument('--assigned-proxy-url', help='A specific proxy URL to use for the request, overriding the server\'s proxy pool logic.') + parser.add_argument('--proxy-rename', help='Apply sed-style regex substitution to the assigned proxy URL. Format: s/pattern/replacement/') + parser.add_argument('--print-proxy', action='store_true', help='Print the proxy used for the request to stderr.') + parser.add_argument('--verbose', action='store_true', help='Enable verbose output') + parser.add_argument('--log-return', action='store_true', help='Log the full summary of the thrift response to stderr, including detailed logs.\nThis is a convenience flag that implies --show-prefetch-log, --show-nodejs-log, and --show-ytdlp-log.') + parser.add_argument('--show-prefetch-log', action='store_true', help='Print the curl pre-fetch log from the server response.') + parser.add_argument('--show-nodejs-log', action='store_true', help='Print the Node.js debug log from the server response.') + parser.add_argument('--show-ytdlp-log', action='store_true', help='Print the yt-dlp debug log from the server response.') + parser.add_argument('--direct', action='store_true', help='Use the direct yt-dlp info.json generation method, bypassing Node.js token generation.') + parser.add_argument('--print-info-out', action='store_true', help='Print the final info.json to stdout. 
By default, output is suppressed unless writing to a file.') + parser.add_argument('--request-params-json', help=REQUEST_PARAMS_HELP_STRING + '\nCan also be a comma-separated string of key=value pairs (e.g., "caching_policy.mode=force_refresh").') + parser.add_argument('--force-renew', help='Comma-separated list of items to force-renew: cookies, visitor_id, po_token, nsig_cache, all.') + return parser + +def main_get_info(args): + """Main logic for the 'get-info' command.""" + exit_code = 0 + + # Set log level + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + if args.log_file_auto: + video_id = get_video_id(args.url) + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + log_filename = f"{video_id}-{timestamp}.log" + + # Get root logger to add file handler + root_logger = logging.getLogger() + file_handler = logging.FileHandler(log_filename) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + file_handler.setFormatter(formatter) + root_logger.addHandler(file_handler) + + logger.info(f"Logging to file: {log_filename}") + + transport = None + try: + # Determine host and port, giving precedence to --auth-* args + host = args.auth_host or args.host + port = args.auth_port or args.port + + # Create Thrift client + client, transport = get_thrift_client(host, port) + + # Get token data, which includes the info.json + if args.direct: + logger.info(f"Requesting info.json for URL '{args.url}' using DIRECT method.") + if args.client: + logger.info(f"Requesting to use specific client(s): {args.client}") + else: + logger.info("No specific client requested, server will let yt-dlp decide.") + token_data = client.getInfoJsonDirect(url=args.url, clients=args.client) + else: + logger.info(f"Requesting info.json for URL '{args.url}' using profile '{args.profile}'") + + # Prepare arguments for the Thrift call + machine_id = args.machine_id + if not machine_id: + import socket + machine_id = socket.gethostname() + logger.info(f"No machine ID provided, using hostname: {machine_id}") + + request_params = {} + if args.request_params_json: + try: + request_params = json.loads(args.request_params_json) + except json.JSONDecodeError: + logger.info("Could not parse --request-params-json as JSON, trying as key-value string.") + request_params = parse_key_value_params(args.request_params_json) + + if args.force_renew: + items_to_renew = [item.strip() for item in args.force_renew.split(',')] + request_params['force_renew'] = items_to_renew + logger.info(f"Requesting force renew for: {items_to_renew}") + + if args.verbose: + # Add verbose flag for yt-dlp on the server + ytdlp_params = request_params.setdefault('ytdlp_params', {}) + ytdlp_params['verbose'] = True + logger.info("Verbose mode enabled, requesting verbose yt-dlp logs from server.") + + thrift_args = { + 'accountId': args.profile, + 'updateType': TokenUpdateMode.AUTO, + 'url': args.url, + 'clients': args.client, + 'machineId': machine_id, + 'airflowLogContext': None, + 'requestParamsJson': json.dumps(request_params) if request_params else None, + 'assignedProxyUrl': args.assigned_proxy_url + } + + # Handle proxy renaming + assigned_proxy = args.assigned_proxy_url + if assigned_proxy and args.proxy_rename: + rename_rule = args.proxy_rename.strip("'\"") + if rename_rule.startswith('s/') and rename_rule.count('/') >= 2: + try: + parts = rename_rule.split('/') + pattern = parts[1] + replacement = parts[2] + original_proxy = assigned_proxy + assigned_proxy = re.sub(pattern, replacement, assigned_proxy) + 
logger.info(f"Renamed proxy URL from '{original_proxy}' to '{assigned_proxy}' using rule '{rename_rule}'") + except re.error as e: + logger.error(f"Invalid regex in --proxy-rename: {e}") + return 1 + except IndexError: + logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/") + return 1 + else: + logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/") + return 1 + thrift_args['assignedProxyUrl'] = assigned_proxy + + if args.client: + logger.info(f"Requesting to use specific client: {args.client}") + else: + logger.info("No specific client requested, server will use its default.") + + token_data = client.getOrRefreshToken(**thrift_args) + + if args.print_proxy: + if hasattr(token_data, 'socks') and token_data.socks: + print(f"Proxy used: {token_data.socks}", file=sys.stderr) + else: + print("Proxy information not available in response.", file=sys.stderr) + + if not token_data or not hasattr(token_data, 'infoJson') or not token_data.infoJson: + logger.error("Server did not return valid info.json data.") + print("Error: Server did not return valid info.json data.", file=sys.stderr) + return 1 + + info_json_str = token_data.infoJson + + # On success, print summary info to stderr for visibility. + # This provides immediate feedback without interfering with piped stdout. + if hasattr(token_data, 'serverVersionInfo') and token_data.serverVersionInfo: + # Filter out the default params line as requested + filtered_info = '\n'.join( + line for line in token_data.serverVersionInfo.split('\n') + if 'Default yt-dlp CLI params:' not in line + ) + print(f"\n--- Server Version Info ---\n{filtered_info}", file=sys.stderr) + if hasattr(token_data, 'requestSummary') and token_data.requestSummary: + try: + summary_data = json.loads(token_data.requestSummary) + print(f"\n--- Request Summary ---\n{summary_data.get('summary', token_data.requestSummary)}", file=sys.stderr) + except json.JSONDecodeError: + # Fallback for old format or non-JSON summary + print(f"\n--- Request Summary ---\n{token_data.requestSummary}", file=sys.stderr) + + # Print detailed logs only if explicitly requested + if hasattr(token_data, 'requestSummary') and token_data.requestSummary: + try: + summary_data = json.loads(token_data.requestSummary) + if args.show_prefetch_log or args.log_return: + print("\n--- Prefetch Log ---", file=sys.stderr) + print(summary_data.get('prefetch_log', 'Not available.'), file=sys.stderr) + if args.show_nodejs_log or args.log_return: + print("\n--- Node.js Log ---", file=sys.stderr) + print(summary_data.get('nodejs_log', 'Not available.'), file=sys.stderr) + if args.show_ytdlp_log or args.log_return: + print("\n--- yt-dlp Log ---", file=sys.stderr) + print(summary_data.get('ytdlp_log', 'Not available.'), file=sys.stderr) + except json.JSONDecodeError: + pass # Fallback already handled above + if hasattr(token_data, 'communicationLogPaths') and token_data.communicationLogPaths: + logger.info("--- Communication Log Paths ---") + for log_path in token_data.communicationLogPaths: + logger.info(f" - {log_path}") + + # Check if the returned info.json is an error report + try: + info_data = json.loads(info_json_str) + if hasattr(token_data, 'socks') and token_data.socks: + info_data['_proxy_url'] = token_data.socks + if isinstance(info_data, dict) and 'error' in info_data: + error_code = info_data.get('errorCode', 'N/A') + error_message = info_data.get('message', info_data.get('error', 'Unknown error')) + logger.error(f"Server returned an error in info.json (Code: 
{error_code}): {error_message}") + print(f"Error from server (Code: {error_code}): {error_message}", file=sys.stderr) + # Optionally print the full error JSON + if args.verbose: + print(json.dumps(info_data, indent=2), file=sys.stderr) + exit_code = 1 + except json.JSONDecodeError: + logger.error(f"Failed to parse info.json from server: {info_json_str[:200]}...") + print("Error: Failed to parse the info.json response from the server.", file=sys.stderr) + return 1 + + logger.info(f"Successfully retrieved info.json ({len(info_json_str)} bytes)") + + # Save to latest-info.json if requested, or if using --output-auto-url-only for convenience + if args.save_latest or args.output_auto_url_only: + base_latest_filename = f"{args.worker_id}-latest" if args.worker_id else "latest" + latest_info_filename = f"{base_latest_filename}-info.json" + latest_proxy_filename = f"{base_latest_filename}-proxy.txt" + + try: + with open(latest_info_filename, 'w', encoding='utf-8') as f: + json.dump(info_data, f, indent=2) + logger.info(f"Wrote info.json to {latest_info_filename}") + print(f"Successfully saved info.json to {latest_info_filename}", file=sys.stderr) + except IOError as e: + logger.error(f"Failed to write to {latest_info_filename}: {e}") + print(f"Error: Failed to write to {latest_info_filename}: {e}", file=sys.stderr) + + if hasattr(token_data, 'socks') and token_data.socks: + try: + with open(latest_proxy_filename, 'w', encoding='utf-8') as f: + f.write(token_data.socks + '\n') + logger.info(f"Wrote proxy to {latest_proxy_filename}") + print(f"Successfully saved proxy to {latest_proxy_filename}", file=sys.stderr) + except IOError as e: + logger.error(f"Failed to write to {latest_proxy_filename}: {e}") + print(f"Error: Failed to write to {latest_proxy_filename}: {e}", file=sys.stderr) + + # Determine output file path if auto-naming is used + output_file = args.output + if args.output_auto or args.output_auto_url_only: + video_id = get_video_id(args.url) + suffix = args.output_auto_suffix or "" + if args.output_auto: + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + client_id = args.client or args.profile + base_filename = f"{timestamp}-{client_id}-{video_id}{suffix}" + output_file = f"{base_filename}-info.json" + + # Save invocation data + invocation_filename = f"{base_filename}-invocation.json" + invocation_data = {} + for attr in ['ytdlpCommand', 'socks', 'jobId', 'url', 'requestSummary', 'communicationLogPaths']: + if hasattr(token_data, attr): + value = getattr(token_data, attr) + if value: + invocation_data[attr] = value + + if hasattr(token_data, 'cookiesBlob') and token_data.cookiesBlob: + invocation_data['cookiesBlob'] = f"present, {len(token_data.cookiesBlob)} bytes" + else: + invocation_data['cookiesBlob'] = "not present" + + try: + with open(invocation_filename, 'w', encoding='utf-8') as f: + json.dump(invocation_data, f, indent=2) + logger.info(f"Wrote invocation data to {invocation_filename}") + except IOError as e: + logger.error(f"Failed to write invocation data to {invocation_filename}: {e}") + + else: # args.output_auto_url_only + output_file = f"{video_id}{suffix}-info.json" + + # Write to output file if specified + if output_file: + try: + # Ensure the output directory exists before writing the file + output_dir = os.path.dirname(output_file) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + with open(output_file, 'w', encoding='utf-8') as f: + # Pretty-print the JSON to the file + json.dump(info_data, f, indent=2) + logger.info(f"Wrote info.json to 
{output_file}") + # Print success message to stderr to not interfere with stdout piping + print(f"Successfully saved info.json to {output_file}", file=sys.stderr) + + # If --output-auto, save invocation data + if args.output_auto: + pass # The latest-info.json logic is now handled by --save-latest + + except IOError as e: + logger.error(f"Failed to write to output file {output_file}: {e}") + print(f"Error: Failed to write to output file {output_file}: {e}", file=sys.stderr) + return 1 + + # Print the JSON to stdout if requested, to allow for piping. + if args.print_info_out: + print(json.dumps(info_data, indent=2)) + + return exit_code + except (PBServiceException, PBUserException) as e: + # Check for non-fatal age-gate errors. These are expected for certain videos + # and should not cause the entire stress test to fail. + is_age_gate_error = hasattr(e, 'errorCode') and e.errorCode == 'AGE_GATED_SIGN_IN' + + if is_age_gate_error: + logger.warning(f"Age-gated content detected for URL '{args.url}'. Treating as a non-fatal warning.") + print(f"Warning: Age-gated content detected for '{args.url}'.", file=sys.stderr) + + # To avoid breaking downstream parsers, output a valid JSON error object. + # This allows stress testers to see a 'success' (exit 0) but still know it was an age gate issue. + error_json = { + "error": "Age-gated content", + "errorCode": "AGE_GATE", + "message": "Sign in to confirm your age." + } + print(json.dumps(error_json, indent=2)) + + # We return success because this is not a system failure. + return 0 + + # Format message for better readability, ensuring newlines are handled. + message = str(e.message or '') + try: + # Attempt to decode as if it has escaped newlines (e.g., '\\n' -> '\n') + message = codecs.decode(message, 'unicode_escape') + except Exception: + # Fallback for safety, though unicode_escape is robust + message = message.replace('\\n', '\n') + + # For known user-facing errors, suppress the full traceback unless verbose is explicitly on. + # The goal is to provide a clean error message for common issues. + user_facing_errors = [ + "BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED", + "VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED", + "AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "GEO_RESTRICTED" + ] + is_user_facing_error = hasattr(e, 'errorCode') and e.errorCode in user_facing_errors + + # Only show full traceback in verbose mode AND if it's NOT a common user-facing error. + show_exc_info = args.verbose and not is_user_facing_error + + logger.error(f"A Thrift error occurred: {message}", exc_info=show_exc_info) + print(f"\n--- ERROR ---", file=sys.stderr) + print(f"{message}", file=sys.stderr) + + if hasattr(e, 'context') and e.context and (args.verbose or not is_user_facing_error): + print(f"\n--- CONTEXT ---", file=sys.stderr) + # The context is a dict from thrift. Pretty print it, handling newlines in values. 
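+ # Context values may arrive with escaped '\n' sequences; decoding them here lets
+ # multi-line log excerpts print readably instead of as one long escaped string.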
+ if isinstance(e.context, dict): + # Process each value to un-escape newlines for clean printing + processed_context = {} + for key, value in e.context.items(): + try: + processed_context[key] = codecs.decode(str(value), 'unicode_escape') + except Exception: + processed_context[key] = str(value).replace('\\n', '\n') + print(json.dumps(processed_context, indent=2), file=sys.stderr) + else: + # Fallback for non-dict context + print(str(e.context), file=sys.stderr) + print("\n", file=sys.stderr) + return 1 + except TTransport.TTransportException as e: + logger.error(f"Connection to server failed: {e}", exc_info=args.verbose) + print(f"Error: Connection to server at {args.host}:{args.port} failed.", file=sys.stderr) + return 1 + except Exception as e: + logger.exception(f"An unexpected error occurred: {e}") + print(f"An unexpected error occurred: {e}", file=sys.stderr) + return 1 + finally: + if transport and transport.isOpen(): + transport.close() + logger.info("Thrift connection closed.") diff --git a/ytops_client/list_formats_tool.py b/ytops_client/list_formats_tool.py new file mode 100644 index 0000000..811f47a --- /dev/null +++ b/ytops_client/list_formats_tool.py @@ -0,0 +1,228 @@ +""" +Tool to list available formats from a yt-dlp info.json file. +""" + +import sys +import json +import argparse +import re +from urllib.parse import urlparse, parse_qs +from datetime import datetime, timezone + +def format_size(b): + """Format size in bytes to human-readable string.""" + if b is None: + return 'N/A' + if b < 1024: + return f"{b}B" + elif b < 1024**2: + return f"{b/1024:.2f}KiB" + elif b < 1024**3: + return f"{b/1024**2:.2f}MiB" + else: + return f"{b/1024**3:.2f}GiB" + +def list_formats(info_json, requested_formats_str=None, file=sys.stdout): + """Prints a table of available formats from info.json data.""" + formats = info_json.get('formats', []) + if not formats: + print("No formats found in the provided info.json.", file=file) + return + + requested_formats = [] + requested_order = {} + if requested_formats_str: + # Split by comma or slash, and filter out empty strings + requested_formats = [item for item in re.split(r'[,/]', requested_formats_str) if item] + requested_order = {fmt: i for i, fmt in enumerate(requested_formats)} + + def sort_key(f): + fid = f.get('format_id', '') + is_requested = fid in requested_order + if is_requested: + # Sort requested formats by the order they were provided + return (False, requested_order[fid]) + else: + # Sort other formats numerically by ID + return (True, int(fid) if fid.isdigit() else 999) + + sorted_formats = sorted(formats, key=sort_key) + + # Check if any requested formats were found + if requested_formats: + found_any = any(f.get('format_id') in requested_order for f in formats) + if not found_any: + print("WARNING: No format from list found.", file=sys.stderr) + + # Header + header = "{:<6} {:<7} {:<12} {:<5} {:<18} {:<18} {:<12} {:<10} {:<20} {:<17} {:<15} {:<12} {:<12} {:<12} {:<5} {:<12} {:<12} {:<12} {:<12} {:<12}".format( + "ID", "EXT", "RESOLUTION", "FPS", "VCODEC", "ACODEC", "FILESIZE", "TBR", "URL (path)", "EXPIRE (UTC)", "IP", "ID_TOKEN", "SESS_TOKEN", "EI_TOKEN", "GIR", "BUI_TOKEN", "POT_TOKEN", "MT_TOKEN", "SIG", "LSIG" + ) + print(header, file=file) + print("-" * len(header), file=file) + + for f in sorted_formats: + format_id = f.get('format_id', 'N/A') + ext = f.get('ext', 'N/A') + + resolution = f.get('resolution') + if not resolution: + if 'width' in f and f['width'] is not None: + resolution = f"{f['width']}x{f['height']}" + 
else: + resolution = 'audio only' + + fps = f.get('fps', '') + vcodec = f.get('vcodec', 'none') + acodec = f.get('acodec', 'none') + filesize = f.get('filesize') or f.get('filesize_approx') + tbr = f.get('tbr') + + display_id = f"*{format_id}" if format_id in requested_order else format_id + + url = f.get('url', '') + partial_url, expire_date, ip, id_token_short, sess_token_short, ei_token_short, gir, bui_token_short, pot_token_short, mt_token_short, sig_short, lsig_short = ('N/A',) * 12 + if url: + parsed = urlparse(url) + query_params = parse_qs(parsed.query) + + path_and_query = parsed.path + if parsed.query: + path_and_query += '?' + parsed.query + + if len(path_and_query) > 18: + partial_url = path_and_query[:8] + '...' + path_and_query[-7:] + else: + partial_url = path_and_query + + expire_ts = query_params.get('expire', [None])[0] + if expire_ts: + try: + expire_date = datetime.fromtimestamp(int(expire_ts), timezone.utc).strftime('%m-%d %H:%M:%S') + except (ValueError, TypeError): + expire_date = 'Invalid' + + ip = query_params.get('ip', ['N/A'])[0] + + id_token = query_params.get('id', [None])[0] + if id_token and len(id_token) > 12: + id_token_short = id_token[:6] + '..' + id_token[-4:] + elif id_token: + id_token_short = id_token + + sess_token = query_params.get('n', [None])[0] + if sess_token and len(sess_token) > 12: + sess_token_short = sess_token[:6] + '..' + sess_token[-4:] + elif sess_token: + sess_token_short = sess_token + + ei_token = query_params.get('ei', [None])[0] + if ei_token and len(ei_token) > 12: + ei_token_short = ei_token[:6] + '..' + ei_token[-4:] + elif ei_token: + ei_token_short = ei_token + + gir = query_params.get('gir', ['N/A'])[0] + + bui_token = query_params.get('bui', [None])[0] + if bui_token and len(bui_token) > 12: + bui_token_short = bui_token[:6] + '..' + bui_token[-4:] + elif bui_token: + bui_token_short = bui_token + + pot_token = query_params.get('pot', [None])[0] + if pot_token and len(pot_token) > 12: + pot_token_short = pot_token[:6] + '..' + pot_token[-4:] + elif pot_token: + pot_token_short = pot_token + + mt_token = query_params.get('mt', [None])[0] + # mt is often just a timestamp, don't shorten unless it's a long hash + if mt_token and len(mt_token) > 12: + mt_token_short = mt_token[:6] + '..' + mt_token[-4:] + elif mt_token: + mt_token_short = mt_token + + sig = query_params.get('sig', [None])[0] + if sig and len(sig) > 12: + sig_short = sig[:6] + '..' + sig[-4:] + elif sig: + sig_short = sig + + lsig = query_params.get('lsig', [None])[0] + if lsig and len(lsig) > 12: + lsig_short = lsig[:6] + '..' + lsig[-4:] + elif lsig: + lsig_short = lsig + + print("{:<6} {:<7} {:<12} {:<5} {:<18} {:<18} {:<12} {:<10} {:<20} {:<17} {:<15} {:<12} {:<12} {:<12} {:<5} {:<12} {:<12} {:<12} {:<12} {:<12}".format( + str(display_id), + str(ext), + str(resolution), + str(fps) if fps else '', + str(vcodec)[:18], + str(acodec)[:18], + format_size(filesize), + f"{tbr:.0f}k" if tbr else 'N/A', + partial_url, + expire_date, + ip, + id_token_short, + sess_token_short, + ei_token_short, + gir, + bui_token_short, + pot_token_short, + mt_token_short, + sig_short, + lsig_short + ), file=file) + +def add_list_formats_parser(subparsers): + """Add the parser for the 'list-formats' command.""" + parser = subparsers.add_parser( + 'list-formats', + description="List available formats from a yt-dlp info.json file.", + formatter_class=argparse.RawTextHelpFormatter, + help="List available formats from a yt-dlp info.json file." 
+ ) + parser.add_argument( + '--load-info-json', + type=argparse.FileType('r', encoding='utf-8'), + default=sys.stdin, + help="Path to the info.json file. Reads from stdin if not provided." + ) + parser.add_argument( + '-f', '--formats', + help='Comma or slash-separated list of format IDs to highlight and prioritize (e.g., "18,140,299/298").' + ) + parser.add_argument( + '-p', '--pass-through', + action='store_true', + help='Pass the input JSON through to stdout, printing the format list to stderr.' + ) + return parser + +def main_list_formats(args): + """Main logic for the 'list-formats' command.""" + try: + # Read the whole content to allow passing it through + info_json_content = args.load_info_json.read() + info_data = json.loads(info_json_content) + + # Determine output stream for the format list + output_stream = sys.stderr if args.pass_through else sys.stdout + list_formats(info_data, args.formats, file=output_stream) + + # If pass-through is enabled, print the original JSON to stdout + if args.pass_through: + # Use end='' because the read content likely includes a trailing newline + print(info_json_content, end='') + + return 0 + except json.JSONDecodeError: + print("Error: Invalid JSON provided.", file=sys.stderr) + return 1 + except Exception as e: + print(f"An unexpected error occurred: {e}", file=sys.stderr) + return 1 diff --git a/ytops_client/request_params_help.py b/ytops_client/request_params_help.py new file mode 100644 index 0000000..5dd3dae --- /dev/null +++ b/ytops_client/request_params_help.py @@ -0,0 +1,48 @@ +# Using a separate file for this long help message to keep the main script clean. +# It's imported by client tools that use the --request-params-json argument. + +REQUEST_PARAMS_HELP_STRING = """JSON string with per-request parameters to override server defaults. +Example of a full configuration JSON showing default values (use single quotes to wrap it): +'{ + "_comment": "This JSON object allows overriding server-side defaults for a single request.", + "cookies_file_path": "/path/to/your/cookies.txt", + + "context_reuse_policy": { + "enabled": true, + "max_age_seconds": 86400, + "reuse_visitor_id": true, + "reuse_cookies": true + }, + "_comment_context_reuse_policy": "Controls how the server reuses session context (cookies, visitor ID) from the account's previous successful request.", + "_comment_reuse_visitor_id": "If true, reuses the visitor ID from the last session to maintain a consistent identity to YouTube. This is automatically disabled for TV clients to avoid bot detection.", + + "ytdlp_params": { + "use_curl_prefetch": false, + "skip_cache": false, + "visitor_id_override_enabled": true, + "extractor_args": { + "youtubepot-bgutilhttp": { + "base_url": "http://172.17.0.1:4416" + }, + "youtube": { + "pot_trace": "true", + "formats": "duplicate", + "player_js_version": "actual" + }, + "youtubepot-webpo": { + "bind_to_visitor_id": "true" + } + } + }, + "_comment_ytdlp_params": "Parameters passed directly to the yt-dlp wrapper for info.json generation.", + "_comment_visitor_id_override_enabled": "If true (default), the server validates the visitor ID from the token generator and creates a new one if it is invalid. Set to false to force using the provided visitor ID without validation, which is useful for debugging.", + "_comment_extractor_args": "Directly override yt-dlp extractor arguments. To use BGUtils in script mode, replace 'youtubepot-bgutilhttp' with 'youtubepot-bgutilscript'. 
The script path is '/opt/bgutil-ytdlp-pot-provider-server/build/generate_once.js'. To disable any explicit provider (like '--bgutils-mode none' on the server), remove both 'youtubepot-bgutilhttp' and 'youtubepot-bgutilscript' keys.", + + "session_params": { + "lang": "en-US", + "location": "US", + "deviceCategory": "MOBILE", + "user_agent": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)" + }, + "_comment_session_params": "Parameters for the token generation session (primarily for Node.js)." +}'""" diff --git a/ytops_client/stress_formats_tool.py b/ytops_client/stress_formats_tool.py new file mode 100644 index 0000000..f45dbee --- /dev/null +++ b/ytops_client/stress_formats_tool.py @@ -0,0 +1,788 @@ +#!/usr/bin/env python3 +""" +Tool to stress-test video format download URLs from an info.json. +""" + +import argparse +import collections +import concurrent.futures +import json +import logging +import os +import random +import re +import shlex +import signal +import subprocess +import sys +import threading +import time +from datetime import datetime, timezone +from pathlib import Path +from urllib.parse import urlparse, parse_qs + +# Configure logging +logger = logging.getLogger('stress_formats_tool') + + +def get_video_id(url: str) -> str: + """Extracts a YouTube video ID from a URL.""" + # For URLs like https://www.youtube.com/watch?v=VIDEO_ID + match = re.search(r"v=([0-9A-Za-z_-]{11})", url) + if match: + return match.group(1) + # For URLs like https://youtu.be/VIDEO_ID + match = re.search(r"youtu\.be\/([0-9A-Za-z_-]{11})", url) + if match: + return match.group(1) + # For plain video IDs + if re.fullmatch(r'[0-9A-Za-z_-]{11}', url): + return url + return "unknown_video_id" + + +def get_display_name(path_or_url): + """Returns a clean name for logging, either a filename or a video ID.""" + if isinstance(path_or_url, Path): + return path_or_url.name + + path_str = str(path_or_url) + video_id = get_video_id(path_str) + if video_id != "unknown_video_id": + return video_id + + # Fallback for file paths as strings or weird URLs + return Path(path_str).name + + +def format_size(b): + """Format size in bytes to human-readable string.""" + if b is None: + return 'N/A' + if b < 1024: + return f"{b}B" + elif b < 1024**2: + return f"{b/1024:.2f}KiB" + elif b < 1024**3: + return f"{b/1024**2:.2f}MiB" + else: + return f"{b/1024**3:.2f}GiB" + + +class StatsTracker: + """Tracks and reports statistics for the stress test.""" + def __init__(self, stats_file=None): + self.events = [] + self.start_time = time.time() + self.lock = threading.Lock() + self.stats_file_path = stats_file + self.stats_file_handle = None + if self.stats_file_path: + try: + self.stats_file_handle = open(self.stats_file_path, 'a', encoding='utf-8') + except IOError as e: + logger.error(f"Could not open stats file {self.stats_file_path}: {e}") + + def log_event(self, event_data): + """Log a download attempt event.""" + with self.lock: + event_data['timestamp'] = datetime.now().isoformat() + self.events.append(event_data) + if self.stats_file_handle: + self.stats_file_handle.write(json.dumps(event_data) + '\n') + self.stats_file_handle.flush() + + def close(self): + """Close the stats file.""" + if self.stats_file_handle: + self.stats_file_handle.close() + + def print_summary(self): + """Print a summary of the test run.""" + with self.lock: + if not self.events: + logger.info("No events were recorded.") + return + + duration = time.time() - 
self.start_time + + # Separate events by type + fetch_events = [e for e in self.events if e.get('type') == 'fetch'] + download_events = [e for e in self.events if e.get('type') != 'fetch'] # Default to download for old events + + logger.info("\n--- Test Summary ---") + logger.info(f"Total duration: {duration:.2f} seconds") + + if fetch_events: + total_fetches = len(fetch_events) + successful_fetches = sum(1 for e in fetch_events if e['success']) + failed_fetches = total_fetches - successful_fetches + logger.info("\n--- Fetch Summary ---") + logger.info(f"Total info.json fetch attempts: {total_fetches}") + logger.info(f" - Successful: {successful_fetches}") + logger.info(f" - Failed: {failed_fetches}") + if total_fetches > 0: + success_rate = (successful_fetches / total_fetches) * 100 + logger.info(f"Success rate: {success_rate:.2f}%") + if failed_fetches > 0: + error_counts = collections.Counter(e.get('error_type', 'Unknown') for e in fetch_events if not e['success']) + logger.info("Failure breakdown:") + for error_type, count in sorted(error_counts.items()): + logger.info(f" - {error_type}: {count}") + + if download_events: + total_attempts = len(download_events) + successes = sum(1 for e in download_events if e['success']) + failures = total_attempts - successes + + logger.info("\n--- Download Summary ---") + logger.info(f"Total download attempts: {total_attempts}") + logger.info(f" - Successful: {successes}") + logger.info(f" - Failed: {failures}") + + if total_attempts > 0: + success_rate = (successes / total_attempts) * 100 + logger.info(f"Success rate: {success_rate:.2f}%") + + if duration > 1 and total_attempts > 0: + dpm = (total_attempts / duration) * 60 + logger.info(f"Attempt rate: {dpm:.2f} attempts/minute") + + # Download volume stats + total_bytes = sum(e.get('downloaded_bytes', 0) for e in download_events if e['success']) + if total_bytes > 0: + logger.info(f"Total data downloaded: {format_size(total_bytes)}") + if duration > 1: + bytes_per_second = total_bytes / duration + gb_per_hour = (bytes_per_second * 3600) / (1024**3) + gb_per_day = gb_per_hour * 24 + logger.info(f"Download rate: {gb_per_hour:.3f} GB/hour ({gb_per_day:.3f} GB/day)") + + if failures > 0: + error_counts = collections.Counter(e.get('error_type', 'Unknown') for e in download_events if not e['success']) + logger.info("Failure breakdown:") + for error_type, count in sorted(error_counts.items()): + logger.info(f" - {error_type}: {count}") + + logger.info("--------------------") + +def print_banner(args, info_jsons=None, urls=None): + """Prints a summary of the test configuration.""" + logger.info("--- Stress Test Configuration ---") + if args.urls_file: + if args.fetch_only: + logger.info(f"Mode: Fetch-only. 
Generating info.json files from URL list.") + else: + logger.info(f"Mode: Full-stack test from URL list.") + logger.info(f"URL file: {args.urls_file} ({len(urls)} URLs)") + logger.info(f"Workers: {args.workers}") + logger.info(f"Info.json command: {args.info_json_gen_cmd}") + if args.info_json_gen_cmd_alt and args.alt_cmd_every_n > 0: + logger.info(f"Alternate command (every {args.alt_cmd_every_n} URLs): {args.info_json_gen_cmd_alt}") + if args.profile_prefix: + if args.profile_pool: + logger.info(f"Profile mode: Pool of {args.profile_pool} (prefix: {args.profile_prefix})") + elif args.profile_per_request: + logger.info(f"Profile mode: New profile per request (prefix: {args.profile_prefix})") + else: # info-json-files + logger.info(f"Mode: Download-only from static info.json files.") + if info_jsons: + logger.info(f"Files: {', '.join(str(p.name) for p in info_jsons.keys())}") + logger.info(f"Workers: {args.workers}") + + logger.info(f"Format selection: {args.format}") + logger.info(f"Sleep between cycles: {args.sleep}s") + if args.sleep_formats > 0: + logger.info(f"Sleep between formats: {args.sleep_formats}s") + if args.duration > 0: + logger.info(f"Test duration: {args.duration} minutes") + if args.max_attempts > 0: + logger.info(f"Max cycles: {args.max_attempts}") + logger.info(f"Stop on failure: {args.stop_on_failure}") + if args.stop_on_403: + logger.info(f"Stop on 403 error: True") + if args.stop_on_timeout: + logger.info(f"Stop on timeout: True") + logger.info(f"Stats file: {args.stats_file}") + if args.stats_interval > 0: + logger.info(f"Periodic stats interval: {args.stats_interval}s") + if args.format_download_args: + logger.info(f"Extra download args: {args.format_download_args}") + logger.info("Download volume: Tracking total data downloaded") + logger.info("---------------------------------") + +def add_stress_formats_parser(subparsers): + """Add the parser for the 'stress-formats' command.""" + parser = subparsers.add_parser( + 'stress-formats', + description="A simple, command-line driven stress-testing tool for basic scenarios.\nAll options are configured via flags. For more complex scenarios and advanced\nfeatures like rate limiting and client rotation, use the 'stress-policy' command.", + formatter_class=argparse.RawTextHelpFormatter, + help='Run simple, flag-driven stress tests.', + epilog=""" +Usage examples: + +# Test a format from a static info.json every 60 seconds +ytops-client stress-formats --info-json-files my_video.json -f 18 --sleep 60 + +# Test with multiple info.json files in parallel using 4 workers +ytops-client stress-formats --info-json-files "file1.json,file2.json,file3.json" -f 18 --sleep 60 --workers 4 + +# Fetch a new info.json for a URL and test a format every 5 minutes +ytops-client stress-formats --urls-file urls.txt --info-json-gen-cmd "bin/ytops-client get-info {url}" -f "18" --sleep 300 + +# Run the test for exactly 10 cycles, continuing on failure +ytops-client stress-formats --info-json-files my_video.json -f 18 --sleep 10 --max-attempts 10 --no-stop-on-failure +""" + ) + source_group = parser.add_mutually_exclusive_group(required=True) + source_group.add_argument('--info-json-files', help='Comma-separated paths to static info.json files to use for testing.') + source_group.add_argument('--urls-file', help='Path to a file with URLs/IDs to test. Can be a text file (one per line) or a JSON array of strings.') + + parser.add_argument('-f', '--format', help='The format selection string. 
Can be a comma-separated list of IDs (e.g., "18,137"), "all", "random:X%%" (e.g., "random:10%%"), or "random_from:ID1,ID2,..." to pick one from a list. Required unless --fetch-only is used.') + parser.add_argument('--sleep', type=int, default=60, help='Seconds to wait between batches of download attempts. Default: 60.') + parser.add_argument('--sleep-formats', type=int, default=0, help='Seconds to wait between format downloads within a single file/cycle. Default: 0.') + parser.add_argument('--max-attempts', type=int, default=0, help='Maximum number of test cycles. 0 means run indefinitely. Default: 0.') + parser.add_argument('--duration', type=int, default=0, help='Total duration to run the test in minutes. 0 means run indefinitely (or until max-attempts is reached). Default: 0.') + parser.add_argument('--stop-on-failure', action='store_true', help='Stop the test immediately after the first download failure.') + parser.add_argument('--no-stop-on-failure', dest='stop_on_failure', action='store_false', help='Continue testing even after a download failure. (Default)') + parser.set_defaults(stop_on_failure=False) + parser.add_argument('--stop-on-403', action='store_true', help='Stop the test immediately after a 403 Forbidden error.') + parser.add_argument('--stop-on-timeout', action='store_true', help='Stop the test immediately after a read timeout error.') + + parser.add_argument('--fetch-only', action='store_true', help='When used with --urls-file, only fetch and save info.json files without performing download tests.') + + parser.add_argument('--workers', type=int, default=1, help='Number of parallel workers for multi-file mode. Default: 1.') + parser.add_argument('--stats-file', default='stress_test_stats.jsonl', help='File to log statistics for each attempt. Default: stress_test_stats.jsonl') + parser.add_argument('--stats-interval', type=int, default=0, help='Interval in seconds to print stats summary periodically. 0 disables. Default: 0.') + + # Arguments for info.json generation + parser.add_argument('--info-json-gen-cmd', help='Command template to generate info.json. Use {url}, {worker_id}, {cycle}, and {profile} as placeholders. Required with --urls-file.') + parser.add_argument('--info-json-gen-cmd-alt', help='Alternate command template for info.json generation.') + parser.add_argument('--alt-cmd-every-n', type=int, default=0, help='Use the alternate command for every N-th URL (e.g., N=3 means URLs 3, 6, 9...). Requires --info-json-gen-cmd-alt.') + + # Profile generation options + profile_group = parser.add_argument_group('Profile Generation Options (for --urls-file mode)') + profile_group.add_argument('--profile-prefix', help='Base name for generated profile IDs (e.g., "test_user"). Used with --profile-pool or --profile-per-request.') + profile_group.add_argument('--profile-pool', type=int, metavar='N', help='Use a pool of N profiles. Profile ID will be {prefix}_{worker_id %% N}. Requires --profile-prefix.') + profile_group.add_argument('--profile-per-request', action='store_true', help='Generate a new unique profile ID for each request. Profile ID will be {prefix}_{timestamp}_{worker_id}. Requires --profile-prefix.') + + # Arguments to pass to format_download.py + parser.add_argument('--format-download-args', nargs='+', help='Additional arguments to pass to the download tool. 
E.g., --proxy-rename s/old/new/ --cleanup') + + parser.add_argument('--verbose', action='store_true', help='Enable verbose output.') + return parser + +def run_command(cmd, input_data=None): + """Runs a command, captures its output, and returns status.""" + logger.debug(f"Running command: {' '.join(cmd)}") + try: + process = subprocess.Popen( + cmd, + stdin=subprocess.PIPE if input_data else None, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding='utf-8' + ) + stdout, stderr = process.communicate(input=input_data) + return process.returncode, stdout, stderr + except FileNotFoundError: + logger.error(f"Command not found: {cmd[0]}. Make sure it's in your PATH.") + return -1, "", f"Command not found: {cmd[0]}" + except Exception as e: + logger.error(f"An error occurred while running command: {' '.join(cmd)}. Error: {e}") + return -1, "", str(e) + +def run_download_worker(info_json_path, info_json_content, format_to_download, args): + """ + Performs a single download attempt. Designed to be run in a worker thread. + """ + # 1. Attempt download + download_cmd = [ + sys.executable, '-m', 'ytops_client.cli', 'download', + '-f', format_to_download + ] + if args.format_download_args: + # with nargs='+', this is a list. + # If it's one item, it might be a single quoted string of args that needs splitting. + if len(args.format_download_args) == 1: + download_cmd.extend(shlex.split(args.format_download_args[0])) + else: + # multiple items, assume they are already split by shell + download_cmd.extend(args.format_download_args) + + display_name = get_display_name(info_json_path) + logger.info(f"[{display_name} @ {format_to_download}] Kicking off download process...") + retcode, stdout, stderr = run_command(download_cmd, input_data=info_json_content) + + # 2. Check result + is_403_error = "HTTP Error 403" in stderr + is_timeout_error = "Read timed out" in stderr + + result = { + 'type': 'download', + 'path': str(info_json_path), + 'format': format_to_download, + 'success': retcode == 0, + 'error_type': None, + 'details': '', + 'downloaded_bytes': 0 + } + + if retcode == 0: + # Success + downloaded_filepath = '' + # The filename is the last non-empty line of stdout that doesn't look like a progress bar + lines = stdout.splitlines() + for line in reversed(lines): + if line and not line.strip().startswith('['): + downloaded_filepath = line.strip() + break + + details_str = "OK" + if downloaded_filepath: + details_str = f"Downloaded: {Path(downloaded_filepath).name}" + + # Parse download size from stderr + size_in_bytes = 0 + size_match = re.search(r'\[download\]\s+100%\s+of\s+~?([0-9.]+)(B|KiB|MiB|GiB)', stderr) + if size_match: + value = float(size_match.group(1)) + unit = size_match.group(2) + multipliers = {"B": 1, "KiB": 1024, "MiB": 1024**2, "GiB": 1024**3} + size_in_bytes = int(value * multipliers.get(unit, 1)) + result['downloaded_bytes'] = size_in_bytes + details_str += f" ({size_match.group(1)}{unit})" + + result['details'] = details_str + else: + # Failure + # Try to get the most relevant error line + error_lines = [line for line in stderr.strip().split('\n') if 'ERROR:' in line] + if error_lines: + result['details'] = error_lines[-1] + else: + # If no "ERROR:" line, use the last few lines of stderr for context. 
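+ # Keep at most the last three non-empty lines so the recorded details stay compact
+ # while still pointing at why the subprocess failed.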
+ last_lines = stderr.strip().split('\n')[-3:] # Get up to last 3 lines + result['details'] = ' | '.join(line.strip() for line in last_lines if line.strip()) + if not result['details']: + result['details'] = "Unknown error (stderr was empty)" + + if is_403_error: + result['error_type'] = 'HTTP 403' + elif is_timeout_error: + result['error_type'] = 'Timeout' + else: + result['error_type'] = f'Exit Code {retcode}' + + return result + + +def process_info_json_cycle(path, content, args, stats): + """ + Processes one info.json file for one cycle, downloading selected formats sequentially. + Logs events and returns a list of results. + """ + results = [] + should_stop_file = False + display_name = get_display_name(path) + + # Determine formats to test based on the info.json content + try: + info_data = json.loads(content) + available_formats = info_data.get('formats', []) + if not available_formats: + logger.warning(f"[{display_name}] No formats found in info.json. Skipping.") + return [] + + available_format_ids = [f['format_id'] for f in available_formats] + + formats_to_test = [] + format_selection_mode = args.format.lower() + + if format_selection_mode == 'all': + formats_to_test = available_format_ids + logger.info(f"[{display_name}] Testing all {len(formats_to_test)} available formats.") + elif format_selection_mode.startswith('random:'): + try: + percent_str = format_selection_mode.split(':')[1].rstrip('%') + percent = float(percent_str) + if not (0 < percent <= 100): + raise ValueError("Percentage must be between 0 and 100.") + + count = max(1, int(len(available_format_ids) * (percent / 100.0))) + formats_to_test = random.sample(available_format_ids, k=count) + logger.info(f"[{display_name}] Randomly selected {len(formats_to_test)} formats ({percent}%) from all available to test: {', '.join(formats_to_test)}") + except (ValueError, IndexError) as e: + logger.error(f"[{display_name}] Invalid random format selection '{args.format}': {e}. Skipping.") + return [] + elif format_selection_mode.startswith('random_from:'): + try: + choices_str = format_selection_mode.split(':', 1)[1] + if not choices_str: + raise ValueError("No formats provided after 'random_from:'.") + + format_choices = [f.strip() for f in choices_str.split(',') if f.strip()] + + # Filter the choices to only those available in the current info.json + valid_choices = [f for f in format_choices if f in available_format_ids] + + if not valid_choices: + logger.warning(f"[{display_name}] None of the requested formats for random selection ({', '.join(format_choices)}) are available. Skipping.") + return [] + + formats_to_test = [random.choice(valid_choices)] + logger.info(f"[{display_name}] Randomly selected 1 format from your list to test: {formats_to_test[0]}") + except (ValueError, IndexError) as e: + logger.error(f"[{display_name}] Invalid random_from format selection '{args.format}': {e}. 
Skipping.") + return [] + else: + # Standard comma-separated list + requested_formats = [f.strip() for f in args.format.split(',') if f.strip()] + formats_to_test = [] + for req_fmt in requested_formats: + # Check for exact match first + if req_fmt in available_format_ids: + formats_to_test.append(req_fmt) + continue + + # If no exact match, check for formats that start with this ID + '-' + # e.g., req_fmt '140' should match '140-0' + prefix_match = f"{req_fmt}-" + first_match = next((af for af in available_format_ids if af.startswith(prefix_match)), None) + + if first_match: + logger.info(f"[{display_name}] Requested format '{req_fmt}' not found. Using first available match: '{first_match}'.") + formats_to_test.append(first_match) + else: + # This could be a complex selector like 'bestvideo' or '299/298', so keep it. + if req_fmt not in available_format_ids: + logger.warning(f"[{display_name}] Requested format '{req_fmt}' not found in available formats.") + formats_to_test.append(req_fmt) + + except json.JSONDecodeError: + logger.error(f"[{display_name}] Failed to parse info.json. Skipping.") + return [] + + for i, format_id in enumerate(formats_to_test): + if should_stop_file: + break + + # Check if the format URL is expired before attempting to download + format_details = next((f for f in available_formats if f.get('format_id') == format_id), None) + if format_details and 'url' in format_details: + parsed_url = urlparse(format_details['url']) + query_params = parse_qs(parsed_url.query) + expire_ts_str = query_params.get('expire', [None])[0] + if expire_ts_str and expire_ts_str.isdigit(): + expire_ts = int(expire_ts_str) + if expire_ts < time.time(): + logger.warning(f"[{display_name}] Skipping format '{format_id}' because its URL is expired.") + result = { + 'type': 'download', 'path': str(path), 'format': format_id, + 'success': True, 'error_type': 'Skipped', + 'details': 'Download URL is expired', 'downloaded_bytes': 0 + } + stats.log_event(result) + results.append(result) + continue # Move to the next format + + result = run_download_worker(path, content, format_id, args) + stats.log_event(result) + results.append(result) + + status = "SUCCESS" if result['success'] else f"FAILURE ({result['error_type']})" + logger.info(f"Result for {display_name} (format {format_id}): {status} - {result.get('details', 'OK')}") + + if not result['success']: + # This flag stops processing more formats for THIS file in this cycle + # The main loop will decide if all cycles should stop. + if args.stop_on_failure or \ + (args.stop_on_403 and result['error_type'] == 'HTTP 403') or \ + (args.stop_on_timeout and result['error_type'] == 'Timeout'): + logger.info(f"Stopping further format tests for {display_name} in this cycle due to failure.") + should_stop_file = True + + # Sleep between formats if needed + if args.sleep_formats > 0 and i < len(formats_to_test) - 1: + logger.info(f"Sleeping for {args.sleep_formats}s before next format for {display_name}...") + time.sleep(args.sleep_formats) + + return results + + +def main_stress_formats(args): + """Main logic for the 'stress-formats' command.""" + # The --format argument is required unless we are only fetching info.json files. 
+ if not args.fetch_only and not args.format: + logger.error("Error: argument -f/--format is required when not using --fetch-only.") + return 1 + + if (args.profile_pool or args.profile_per_request) and not args.profile_prefix: + logger.error("--profile-prefix is required when using --profile-pool or --profile-per-request.") + return 1 + + if args.urls_file and args.fetch_only and not args.info_json_gen_cmd: + logger.error("--info-json-gen-cmd is required when using --urls-file with --fetch-only.") + return 1 + + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + else: + # Make the default logger more concise for test output + for handler in logging.root.handlers: + handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s', datefmt='%H:%M:%S')) + + stats = StatsTracker(args.stats_file) + start_time = time.time() + duration_seconds = args.duration * 60 if args.duration > 0 else 0 + + # --- Load sources --- + info_jsons = {} + urls = [] + if args.info_json_files: + info_json_files = [Path(p.strip()) for p in args.info_json_files.split(',')] + for file_path in info_json_files: + if not file_path.is_file(): + logger.error(f"Info.json file not found: {file_path}") + continue + try: + with open(file_path, 'r', encoding='utf-8') as f: + info_jsons[file_path] = f.read() + except (IOError, json.JSONDecodeError) as e: + logger.error(f"Failed to read or parse {file_path}: {e}") + + if not info_jsons: + logger.error("No valid info.json files to process. Exiting.") + return 1 + logger.info(f"Loaded {len(info_jsons)} info.json file(s).") + print_banner(args, info_jsons=info_jsons) + + elif args.urls_file: + if not args.info_json_gen_cmd: + logger.error("--info-json-gen-cmd is required when using --urls-file.") + return 1 + try: + with open(args.urls_file, 'r', encoding='utf-8') as f: + content = f.read() + # Try parsing as JSON array first + try: + data = json.loads(content) + if isinstance(data, list) and all(isinstance(item, str) for item in data): + urls = data + logger.info(f"Loaded {len(urls)} URLs/IDs from JSON array in {args.urls_file}.") + else: + # Valid JSON, but not a list of strings. Treat as error to avoid confusion. + logger.error(f"URL file '{args.urls_file}' is valid JSON but not an array of strings.") + return 1 + except json.JSONDecodeError: + # Fallback to line-by-line parsing for plain text files + urls = [line.strip() for line in content.splitlines() if line.strip()] + logger.info(f"Loaded {len(urls)} URLs/IDs from text file {args.urls_file}.") + + if not urls: + logger.error(f"URL file '{args.urls_file}' is empty or contains no valid URLs/IDs.") + return 1 + except IOError as e: + logger.error(f"Failed to read URL file {args.urls_file}: {e}") + return 1 + + # Clean up URLs/IDs which might have extra quotes, commas, or brackets from copy-pasting + cleaned_urls = [] + for url in urls: + # Strip whitespace, then trailing comma, then surrounding junk, then whitespace again + cleaned_url = url.strip().rstrip(',').strip().strip('\'"[]').strip() + if cleaned_url: + cleaned_urls.append(cleaned_url) + + if len(cleaned_urls) != len(urls): + logger.info(f"Cleaned URL list, removed {len(urls) - len(cleaned_urls)} empty or invalid entries.") + + urls = cleaned_urls + if not urls: + logger.error("URL list is empty after cleaning. 
Exiting.") + return 1 + + print_banner(args, urls=urls) + + # --- Main test loop --- + cycles = 0 + last_stats_print_time = time.time() + try: + # --- Worker function for URL mode --- + def process_url_task(url, url_index, cycle_num): + """Worker to generate info.json for a URL and then test formats.""" + # 1. Generate profile name if configured + profile_name = None + if args.profile_prefix: + if args.profile_pool: + profile_name = f"{args.profile_prefix}_{url_index % args.profile_pool}" + elif args.profile_per_request: + timestamp = datetime.now().strftime('%Y%m%d%H%M%S') + profile_name = f"{args.profile_prefix}_{timestamp}_{url_index}" + + # 2. Select and format the generation command + gen_cmd_template = args.info_json_gen_cmd + if args.alt_cmd_every_n > 0 and args.info_json_gen_cmd_alt and (url_index + 1) % args.alt_cmd_every_n == 0: + gen_cmd_template = args.info_json_gen_cmd_alt + logger.info(f"Using alternate command for URL #{url_index + 1}: {url}") + + try: + # shlex.split handles quoted arguments in the template + video_id = get_video_id(url) + gen_cmd = [] + template_args = shlex.split(gen_cmd_template) + + # If the video ID could be mistaken for an option, and it appears to be + # a positional argument, insert '--' to prevent misinterpretation. + if video_id.startswith('-'): + try: + # Heuristic: if {url} is the last token, it's likely positional. + if template_args and template_args[-1] == '{url}': + template_args.insert(-1, '--') + except (ValueError, IndexError): + pass # {url} not found or list is empty. + + for arg in template_args: + # Replace placeholders + formatted_arg = arg.replace('{url}', video_id) \ + .replace('{worker_id}', str(url_index)) \ + .replace('{cycle}', str(cycle_num)) + if profile_name: + formatted_arg = formatted_arg.replace('{profile}', profile_name) + gen_cmd.append(formatted_arg) + + # Pass verbose flag through if set + if args.verbose and 'get_info_json_client.py' in gen_cmd_template and '--verbose' not in gen_cmd_template: + gen_cmd.append('--verbose') + + except Exception as e: + logger.error(f"Failed to format --info-json-gen-cmd: {e}") + stats.log_event({'path': url, 'success': False, 'error_type': 'BadGenCmd', 'details': 'Cmd format error'}) + return [] + + # 3. Run command to get info.json + log_msg = f"[{url}] Generating info.json" + if profile_name: + log_msg += f" with profile '{profile_name}'" + log_msg += "..." + logger.info(log_msg) + + retcode, stdout, stderr = run_command(gen_cmd) + if retcode != 0: + error_msg = stderr.strip().split('\n')[-1] + logger.error(f"[{url}] Failed to generate info.json: {error_msg}") + event = {'type': 'fetch', 'path': url, 'success': False, 'error_type': 'GetInfoJsonFail', 'details': error_msg} + stats.log_event(event) + return [] # Return empty list, as no formats were tested + + # Handle --fetch-only + if args.fetch_only: + logger.info(f"[{url}] Successfully fetched info.json. Skipping download due to --fetch-only.") + event = {'type': 'fetch', 'path': url, 'success': True, 'details': 'OK'} + stats.log_event(event) + return [] # Return empty list, indicating no downloads to check for failure + + # 4. Pass to the format processing function + return process_info_json_cycle(url, stdout, args, stats) + + while True: + if duration_seconds and (time.time() - start_time) > duration_seconds: + logger.info(f"Reached duration limit of {args.duration} minutes. Stopping.") + break + cycles += 1 + if args.max_attempts > 0 and cycles > args.max_attempts: + logger.info(f"Reached max cycles ({args.max_attempts}). 
Stopping.") + break + + logger.info(f"--- Cycle #{cycles} ---") + + with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as executor: + future_to_identifier = {} + if args.info_json_files: + future_to_identifier = { + executor.submit(process_info_json_cycle, path, content, args, stats): path + for path, content in info_jsons.items() + } + elif args.urls_file: + future_to_identifier = { + executor.submit(process_url_task, url, i, cycles): url + for i, url in enumerate(urls) + } + + should_stop = False + + # Use a set of futures that we can modify while iterating + futures = set(future_to_identifier.keys()) + + while futures and not should_stop: + # Wait for the next future to complete + done, futures = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED) + + for future in done: + identifier = future_to_identifier[future] + identifier_name = get_display_name(identifier) + try: + results = future.result() + # Check if any result from this file triggers a global stop + for result in results: + if not result['success']: + if args.stop_on_failure: + logger.info(f"Failure on {identifier_name} (format {result['format']}). Shutting down all workers due to --stop-on-failure.") + should_stop = True + elif args.stop_on_403 and result['error_type'] == 'HTTP 403': + logger.info(f"403 error on {identifier_name} (format {result['format']}). Shutting down all workers due to --stop-on-403.") + should_stop = True + elif args.stop_on_timeout and result['error_type'] == 'Timeout': + logger.info(f"Timeout on {identifier_name} (format {result['format']}). Shutting down all workers due to --stop-on-timeout.") + should_stop = True + except Exception as exc: + logger.error(f'{identifier_name} generated an exception: {exc}') + stats.log_event({'path': str(identifier), 'success': False, 'error_type': 'Exception', 'details': str(exc)}) + + if should_stop: + break # Stop processing results from 'done' set + + # Check for duration limit after each batch of tasks completes + if duration_seconds and (time.time() - start_time) > duration_seconds: + logger.info(f"Reached duration limit of {args.duration} minutes. Cancelling remaining tasks.") + should_stop = True + + # If the loop was exited, cancel any remaining tasks + if should_stop and futures: + logger.info(f"Cancelling {len(futures)} outstanding task(s).") + for future in futures: + future.cancel() + + if should_stop: + break + + if args.stats_interval > 0 and (time.time() - last_stats_print_time) >= args.stats_interval: + stats.print_summary() + last_stats_print_time = time.time() + + if args.max_attempts > 0 and cycles >= args.max_attempts: + break + + logger.info(f"Cycle complete. Sleeping for {args.sleep} seconds...") + + # Interruptible sleep that respects the total test duration + sleep_end_time = time.time() + args.sleep + should_stop_after_sleep = False + while time.time() < sleep_end_time: + if duration_seconds and (time.time() - start_time) >= duration_seconds: + logger.info(f"Reached duration limit of {args.duration} minutes during sleep. 
Stopping.") + should_stop_after_sleep = True + break + time.sleep(1) # Check every second + + if should_stop_after_sleep: + break + + except KeyboardInterrupt: + logger.info("\nCtrl+C received, shutting down...") + finally: + stats.print_summary() + stats.close() + + return 0 if not any(not e['success'] for e in stats.events) else 1 diff --git a/ytops_client/stress_policy_tool.py b/ytops_client/stress_policy_tool.py new file mode 100644 index 0000000..4c2cd0a --- /dev/null +++ b/ytops_client/stress_policy_tool.py @@ -0,0 +1,2420 @@ +#!/usr/bin/env python3 +""" +Policy-driven stress-testing orchestrator for video format downloads. +""" + +import argparse +import collections +import collections.abc +import concurrent.futures +import json +import logging +import os +import random +import re +import shlex +import signal +import subprocess +import sys +import threading +import time +from copy import deepcopy +from datetime import datetime, timezone +from pathlib import Path +from urllib.parse import urlparse, parse_qs + +try: + import yaml +except ImportError: + print("PyYAML is not installed. Please install it with: pip install PyYAML", file=sys.stderr) + sys.exit(1) + +# Add a global event for graceful shutdown +shutdown_event = threading.Event() + +# Globals for tracking and terminating subprocesses on shutdown +running_processes = set() +process_lock = threading.Lock() + +# Globals for assigning a stable ID to each worker thread +worker_id_map = {} +worker_id_counter = 0 +worker_id_lock = threading.Lock() + +# Configure logging +logger = logging.getLogger('stress_policy_tool') + + +def get_worker_id(): + """Assigns a stable, sequential ID to each worker thread.""" + global worker_id_counter + thread_id = threading.get_ident() + with worker_id_lock: + if thread_id not in worker_id_map: + worker_id_map[thread_id] = worker_id_counter + worker_id_counter += 1 + return worker_id_map[thread_id] + + +def get_video_id(url: str) -> str: + """Extracts a YouTube video ID from a URL.""" + match = re.search(r"v=([0-9A-Za-z_-]{11})", url) + if match: + return match.group(1) + match = re.search(r"youtu\.be\/([0-9A-Za-z_-]{11})", url) + if match: + return match.group(1) + if re.fullmatch(r'[0-9A-Za-z_-]{11}', url): + return url + return "unknown_video_id" + + +def get_display_name(path_or_url): + """Returns a clean name for logging, either a filename or a video ID.""" + if isinstance(path_or_url, Path): + return path_or_url.name + + path_str = str(path_or_url) + video_id = get_video_id(path_str) + if video_id != "unknown_video_id": + return video_id + + return Path(path_str).name + + +def format_size(b): + """Format size in bytes to human-readable string.""" + if b is None: + return 'N/A' + if b < 1024: + return f"{b}B" + elif b < 1024**2: + return f"{b/1024:.2f}KiB" + elif b < 1024**3: + return f"{b/1024**2:.2f}MiB" + else: + return f"{b/1024**3:.2f}GiB" + + +def flatten_dict(d, parent_key='', sep='.'): + """Flattens a nested dictionary.""" + items = {} + for k, v in d.items(): + new_key = parent_key + sep + k if parent_key else k + if isinstance(v, collections.abc.MutableMapping): + items.update(flatten_dict(v, new_key, sep=sep)) + else: + items[new_key] = v + return items + + +def print_policy_overrides(policy): + """Prints all policy values as a single-line of --set arguments.""" + # We don't want to include the 'name' key in the overrides. 
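+    # Work on a deep copy so the caller's policy dict is left untouched.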
+ policy_copy = deepcopy(policy) + policy_copy.pop('name', None) + + flat_policy = flatten_dict(policy_copy) + + set_args = [] + for key, value in sorted(flat_policy.items()): + if value is None: + value_str = 'null' + elif isinstance(value, bool): + value_str = str(value).lower() + elif isinstance(value, (list, dict)): + # Use compact JSON for lists/dicts + value_str = json.dumps(value, separators=(',', ':')) + else: + value_str = str(value) + + # Use shlex.quote to handle spaces and special characters safely + set_args.append(f"--set {shlex.quote(f'{key}={value_str}')}") + + print(' '.join(set_args)) + + +def get_profile_from_filename(path, regex_pattern): + """Extracts a profile name from a filename using a regex.""" + if not regex_pattern: + return None + match = re.search(regex_pattern, path.name) + if match: + # Assume the first capturing group is the profile name + if match.groups(): + return match.group(1) + return None + + +class StateManager: + """Tracks statistics, manages rate limits, and persists state across runs.""" + def __init__(self, policy_name): + self.state_file_path = Path(f"{policy_name}_state.json") + self.stats_file_path = Path(f"{policy_name}_stats.jsonl") + self.lock = threading.RLock() + self.start_time = time.time() + self.events = [] + self.state = { + 'global_request_count': 0, + 'rate_limit_trackers': {}, # e.g., {'per_ip': [ts1, ts2], 'profile_foo': [ts3, ts4]} + 'profile_request_counts': {}, # for client rotation + 'profile_last_refresh_time': {}, # for client rotation + 'proxy_last_finish_time': {}, # for per-proxy sleep + 'processed_files': [], # For continuous download_only mode + # For dynamic profile cooldown strategy + 'profile_cooldown_counts': {}, + 'profile_cooldown_sleep_until': {}, + 'profile_pool_size': 0, + 'profile_run_suffix': None, + 'worker_profile_generations': {} + } + self.stats_file_handle = None + self._load_state() + self.print_historical_summary() + self._open_stats_log() + + def _load_state(self): + if not self.state_file_path.exists(): + logger.info(f"State file not found at '{self.state_file_path}', starting fresh.") + return + try: + with open(self.state_file_path, 'r', encoding='utf-8') as f: + self.state = json.load(f) + # Ensure keys exist + self.state.setdefault('global_request_count', 0) + self.state.setdefault('rate_limit_trackers', {}) + self.state.setdefault('profile_request_counts', {}) + self.state.setdefault('profile_last_refresh_time', {}) + self.state.setdefault('proxy_last_finish_time', {}) + self.state.setdefault('processed_files', []) + # For dynamic profile cooldown strategy + self.state.setdefault('profile_cooldown_counts', {}) + self.state.setdefault('profile_cooldown_sleep_until', {}) + self.state.setdefault('profile_pool_size', 0) + self.state.setdefault('profile_run_suffix', None) + self.state.setdefault('worker_profile_generations', {}) + logger.info(f"Loaded state from {self.state_file_path}") + except (IOError, json.JSONDecodeError) as e: + logger.error(f"Could not load or parse state file {self.state_file_path}: {e}. 
Starting fresh.") + + def _save_state(self): + with self.lock: + try: + with open(self.state_file_path, 'w', encoding='utf-8') as f: + json.dump(self.state, f, indent=2) + logger.info(f"Saved state to {self.state_file_path}") + except IOError as e: + logger.error(f"Could not save state to {self.state_file_path}: {e}") + + def _open_stats_log(self): + try: + self.stats_file_handle = open(self.stats_file_path, 'a', encoding='utf-8') + except IOError as e: + logger.error(f"Could not open stats file {self.stats_file_path}: {e}") + + def close(self): + """Saves state and closes file handles.""" + self._save_state() + if self.stats_file_handle: + self.stats_file_handle.close() + self.stats_file_handle = None + + def mark_file_as_processed(self, file_path): + """Adds a file path to the list of processed files in the state.""" + with self.lock: + # Using a list and checking for existence is fine for moderate numbers of files. + # A set isn't JSON serializable. + processed = self.state.setdefault('processed_files', []) + file_str = str(file_path) + if file_str not in processed: + processed.append(file_str) + + def get_processed_files(self): + """Returns a set of file paths that have been processed.""" + with self.lock: + return set(self.state.get('processed_files', [])) + + def print_historical_summary(self): + """Prints a summary based on the state loaded from disk, before new events.""" + with self.lock: + now = time.time() + rate_trackers = self.state.get('rate_limit_trackers', {}) + total_requests = self.state.get('global_request_count', 0) + + if not rate_trackers and not total_requests: + logger.info("No historical data found in state file.") + return + + logger.info("\n--- Summary From Previous Runs ---") + logger.info(f"Total info.json requests (all previous runs): {total_requests}") + + if rate_trackers: + for key, timestamps in sorted(rate_trackers.items()): + # Time windows in seconds + windows = { + 'last 10 min': 600, + 'last 60 min': 3600, + 'last 6 hours': 21600, + 'last 24 hours': 86400 + } + + rates_str_parts = [] + for name, seconds in windows.items(): + count = sum(1 for ts in timestamps if now - ts <= seconds) + # Calculate rate in requests per minute + rate_rpm = (count / seconds) * 60 if seconds > 0 else 0 + rates_str_parts.append(f"{count} req in {name} ({rate_rpm:.2f} rpm)") + + logger.info(f"Tracker '{key}': " + ", ".join(rates_str_parts)) + logger.info("------------------------------------") + + def log_event(self, event_data): + with self.lock: + event_data['timestamp'] = datetime.now().isoformat() + self.events.append(event_data) + if self.stats_file_handle: + self.stats_file_handle.write(json.dumps(event_data) + '\n') + self.stats_file_handle.flush() + + def get_request_count(self): + with self.lock: + return self.state.get('global_request_count', 0) + + def increment_request_count(self): + with self.lock: + self.state['global_request_count'] = self.state.get('global_request_count', 0) + 1 + + def check_cumulative_error_rate(self, max_errors, per_minutes, error_type=None): + """ + Checks if a cumulative error rate has been exceeded. + If error_type is None, checks for any failure. + Returns the number of errors found if the threshold is met, otherwise 0. 
+ """ + with self.lock: + now = time.time() + window_seconds = per_minutes * 60 + + if error_type: + recent_errors = [ + e for e in self.events + if e.get('error_type') == error_type and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds + ] + else: # Generic failure check + recent_errors = [ + e for e in self.events + if not e.get('success') and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds + ] + + if len(recent_errors) >= max_errors: + return len(recent_errors) + return 0 + + def check_quality_degradation_rate(self, max_triggers, per_minutes): + """ + Checks if the quality degradation trigger rate has been exceeded. + Returns the number of triggers found if the threshold is met, otherwise 0. + """ + with self.lock: + now = time.time() + window_seconds = per_minutes * 60 + + recent_triggers = [ + e for e in self.events + if e.get('quality_degradation_trigger') and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds + ] + + if len(recent_triggers) >= max_triggers: + return len(recent_triggers) + return 0 + + def check_and_update_rate_limit(self, profile_name, policy): + """ + Checks if a request is allowed based on policy rate limits. + If allowed, updates the internal state. Returns True if allowed, False otherwise. + """ + with self.lock: + now = time.time() + gen_policy = policy.get('info_json_generation_policy', {}) + rate_limits = gen_policy.get('rate_limits', {}) + + # Check per-IP limit + ip_limit = rate_limits.get('per_ip') + if ip_limit: + tracker_key = 'per_ip' + max_req = ip_limit.get('max_requests') + period_min = ip_limit.get('per_minutes') + if max_req and period_min: + timestamps = self.state['rate_limit_trackers'].get(tracker_key, []) + # Filter out old timestamps + timestamps = [ts for ts in timestamps if now - ts < period_min * 60] + if len(timestamps) >= max_req: + logger.warning("Per-IP rate limit reached. Skipping task.") + return False + self.state['rate_limit_trackers'][tracker_key] = timestamps + + # Check per-profile limit + profile_limit = rate_limits.get('per_profile') + if profile_limit and profile_name: + tracker_key = f"profile_{profile_name}" + max_req = profile_limit.get('max_requests') + period_min = profile_limit.get('per_minutes') + if max_req and period_min: + timestamps = self.state['rate_limit_trackers'].get(tracker_key, []) + timestamps = [ts for ts in timestamps if now - ts < period_min * 60] + if len(timestamps) >= max_req: + logger.warning(f"Per-profile rate limit for '{profile_name}' reached. Skipping task.") + return False + self.state['rate_limit_trackers'][tracker_key] = timestamps + + # If all checks pass, record the new request timestamp for all relevant trackers + if ip_limit and ip_limit.get('max_requests'): + self.state['rate_limit_trackers'].setdefault('per_ip', []).append(now) + if profile_limit and profile_limit.get('max_requests') and profile_name: + self.state['rate_limit_trackers'].setdefault(f"profile_{profile_name}", []).append(now) + + return True + + def get_client_for_request(self, profile_name, gen_policy): + """ + Determines which client to use based on the client_rotation_policy. + Returns a tuple: (client_name, request_params_dict). + """ + with self.lock: + rotation_policy = gen_policy.get('client_rotation_policy') + + # If no rotation policy, use the simple 'client' key. 
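+            # In that case any optional 'request_params' from the generation policy are passed through unchanged.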
+ if not rotation_policy: + client = gen_policy.get('client') + logger.info(f"Using client '{client}' for profile '{profile_name}'.") + req_params = gen_policy.get('request_params') + return client, req_params + + # --- Rotation logic --- + now = time.time() + major_client = rotation_policy.get('major_client') + refresh_client = rotation_policy.get('refresh_client') + refresh_every = rotation_policy.get('refresh_every', {}) + + if not refresh_client or not refresh_every: + return major_client, rotation_policy.get('major_client_params') + + should_refresh = False + + # Check time-based refresh + refresh_minutes = refresh_every.get('minutes') + last_refresh_time = self.state['profile_last_refresh_time'].get(profile_name, 0) + if refresh_minutes and (now - last_refresh_time) > (refresh_minutes * 60): + should_refresh = True + + # Check request-count-based refresh + refresh_requests = refresh_every.get('requests') + request_count = self.state['profile_request_counts'].get(profile_name, 0) + if refresh_requests and request_count >= refresh_requests: + should_refresh = True + + if should_refresh: + logger.info(f"Profile '{profile_name}' is due for a refresh. Using refresh client '{refresh_client}'.") + self.state['profile_last_refresh_time'][profile_name] = now + self.state['profile_request_counts'][profile_name] = 0 # Reset counter + return refresh_client, rotation_policy.get('refresh_client_params') + else: + # Not refreshing, so increment request count for this profile + self.state['profile_request_counts'][profile_name] = request_count + 1 + return major_client, rotation_policy.get('major_client_params') + + def get_next_available_profile(self, policy): + """ + Finds or creates an available profile based on the dynamic cooldown policy. + Returns a profile name, or None if no profile is available. 
+ """ + with self.lock: + now = time.time() + settings = policy.get('settings', {}) + pm_policy = settings.get('profile_management') + + if not pm_policy: + return None + + prefix = pm_policy.get('prefix') + if not prefix: + logger.error("Profile management policy requires 'prefix'.") + return None + + # Determine and persist the suffix for this run to ensure profile names are stable + run_suffix = self.state.get('profile_run_suffix') + if not run_suffix: + suffix_config = pm_policy.get('suffix') + if suffix_config == 'auto': + run_suffix = datetime.now().strftime('%Y%m%d%H%M') + else: + run_suffix = suffix_config or '' + self.state['profile_run_suffix'] = run_suffix + + # Initialize pool size from policy if not already in state + if self.state.get('profile_pool_size', 0) == 0: + self.state['profile_pool_size'] = pm_policy.get('initial_pool_size', 1) + + max_reqs = pm_policy.get('max_requests_per_profile') + sleep_mins = pm_policy.get('sleep_minutes_on_exhaustion') + + # Loop until a profile is found or we decide we can't find one + while True: + # Try to find an existing, available profile + for i in range(self.state['profile_pool_size']): + profile_name = f"{prefix}_{run_suffix}_{i}" if run_suffix else f"{prefix}_{i}" + + # Check if sleeping + sleep_until = self.state['profile_cooldown_sleep_until'].get(profile_name, 0) + if now < sleep_until: + continue # Still sleeping + + # Check if it needs to be put to sleep + req_count = self.state['profile_cooldown_counts'].get(profile_name, 0) + if max_reqs and req_count >= max_reqs: + sleep_duration_seconds = (sleep_mins or 0) * 60 + self.state['profile_cooldown_sleep_until'][profile_name] = now + sleep_duration_seconds + self.state['profile_cooldown_counts'][profile_name] = 0 # Reset count for next time + logger.info(f"Profile '{profile_name}' reached request limit ({req_count}/{max_reqs}). Putting to sleep for {sleep_mins} minutes.") + continue # Now sleeping, try next profile + + # This profile is available + logger.info(f"Selected available profile '{profile_name}' (request count: {req_count}/{max_reqs if max_reqs else 'unlimited'}).") + return profile_name + + # If we get here, no existing profile was available + if pm_policy.get('auto_expand_pool'): + new_profile_index = self.state['profile_pool_size'] + self.state['profile_pool_size'] += 1 + profile_name = f"{prefix}_{run_suffix}_{new_profile_index}" if run_suffix else f"{prefix}_{new_profile_index}" + logger.info(f"Profile pool exhausted. Expanding pool to size {self.state['profile_pool_size']}. New profile: '{profile_name}'") + return profile_name + else: + # No available profiles and pool expansion is disabled + return None + + def get_or_rotate_worker_profile(self, worker_id, policy): + """ + Gets the current profile for a worker, rotating to a new generation if the lifetime limit is met. + This is used by the 'per_worker_with_rotation' profile mode. 
+ """ + with self.lock: + pm_policy = policy.get('settings', {}).get('profile_management', {}) + if not pm_policy: + logger.error("Profile mode 'per_worker_with_rotation' requires 'settings.profile_management' configuration in the policy.") + return f"error_profile_{worker_id}" + + prefix = pm_policy.get('prefix') + if not prefix: + logger.error("Profile management for 'per_worker_with_rotation' requires a 'prefix'.") + return f"error_profile_{worker_id}" + + max_reqs = pm_policy.get('max_requests_per_profile') + + generations = self.state.setdefault('worker_profile_generations', {}) + # worker_id is an int, but JSON keys must be strings + worker_id_str = str(worker_id) + current_gen = generations.get(worker_id_str, 0) + + profile_name = f"{prefix}_{worker_id}_{current_gen}" + + if not max_reqs: # No lifetime limit defined, so never rotate. + return profile_name + + req_count = self.state.get('profile_cooldown_counts', {}).get(profile_name, 0) + + if req_count >= max_reqs: + logger.info(f"Profile '{profile_name}' reached lifetime request limit ({req_count}/{max_reqs}). Rotating to new generation for worker {worker_id}.") + new_gen = current_gen + 1 + generations[worker_id_str] = new_gen + # The request counts for the old profile are implicitly left behind. + # The new profile will start with a count of 0. + profile_name = f"{prefix}_{worker_id}_{new_gen}" + + return profile_name + + def record_profile_request(self, profile_name): + """Increments the request counter for a profile for the cooldown policy.""" + with self.lock: + if not profile_name: + return + counts = self.state.setdefault('profile_cooldown_counts', {}) + counts[profile_name] = counts.get(profile_name, 0) + 1 + + def record_proxy_usage(self, proxy_url): + """Records a request timestamp for a given proxy URL for statistical purposes.""" + if not proxy_url: + return + with self.lock: + now = time.time() + # Use a prefix to avoid collisions with profile names or other keys + tracker_key = f"proxy_{proxy_url}" + self.state['rate_limit_trackers'].setdefault(tracker_key, []).append(now) + + def check_and_update_download_rate_limit(self, proxy_url, policy): + """Checks download rate limits. Returns True if allowed, False otherwise.""" + with self.lock: + now = time.time() + d_policy = policy.get('download_policy', {}) + rate_limits = d_policy.get('rate_limits', {}) + + # Check per-IP limit + ip_limit = rate_limits.get('per_ip') + if ip_limit: + tracker_key = 'download_per_ip' # Use a distinct key + max_req = ip_limit.get('max_requests') + period_min = ip_limit.get('per_minutes') + if max_req and period_min: + timestamps = self.state['rate_limit_trackers'].get(tracker_key, []) + timestamps = [ts for ts in timestamps if now - ts < period_min * 60] + if len(timestamps) >= max_req: + logger.warning("Per-IP download rate limit reached. Skipping task.") + return False + self.state['rate_limit_trackers'][tracker_key] = timestamps + + # Check per-proxy limit + proxy_limit = rate_limits.get('per_proxy') + if proxy_limit and proxy_url: + tracker_key = f"download_proxy_{proxy_url}" + max_req = proxy_limit.get('max_requests') + period_min = proxy_limit.get('per_minutes') + if max_req and period_min: + timestamps = self.state['rate_limit_trackers'].get(tracker_key, []) + timestamps = [ts for ts in timestamps if now - ts < period_min * 60] + if len(timestamps) >= max_req: + logger.warning(f"Per-proxy download rate limit for '{proxy_url}' reached. 
Skipping task.") + return False + self.state['rate_limit_trackers'][tracker_key] = timestamps + + # If all checks pass, record the new request timestamp for all relevant trackers + if ip_limit and ip_limit.get('max_requests'): + self.state['rate_limit_trackers'].setdefault('download_per_ip', []).append(now) + if proxy_limit and proxy_limit.get('max_requests') and proxy_url: + self.state['rate_limit_trackers'].setdefault(f"download_proxy_{proxy_url}", []).append(now) + + return True + + def wait_for_proxy_cooldown(self, proxy_url, policy): + """If a per-proxy sleep is defined, wait until the cooldown period has passed.""" + with self.lock: + d_policy = policy.get('download_policy', {}) + sleep_duration = d_policy.get('sleep_per_proxy_seconds', 0) + if not proxy_url or not sleep_duration > 0: + return + + last_finish = self.state.setdefault('proxy_last_finish_time', {}).get(proxy_url, 0) + elapsed = time.time() - last_finish + + if elapsed < sleep_duration: + time_to_sleep = sleep_duration - elapsed + logger.info(f"Proxy '{proxy_url}' was used recently. Sleeping for {time_to_sleep:.2f}s.") + # Interruptible sleep + sleep_end_time = time.time() + time_to_sleep + while time.time() < sleep_end_time: + if shutdown_event.is_set(): + logger.info("Shutdown requested during proxy cooldown sleep.") + break + time.sleep(0.2) + + def update_proxy_finish_time(self, proxy_url): + """Updates the last finish time for a proxy.""" + with self.lock: + if not proxy_url: + return + self.state.setdefault('proxy_last_finish_time', {})[proxy_url] = time.time() + + def print_summary(self, policy=None): + """Print a summary of the test run.""" + with self.lock: + # --- Cumulative Stats from State --- + now = time.time() + rate_trackers = self.state.get('rate_limit_trackers', {}) + if rate_trackers: + logger.info("\n--- Cumulative Rate Summary (All Runs, updated at end of run) ---") + logger.info("This shows the total number of requests/downloads over various time windows, including previous runs.") + + fetch_trackers = {k: v for k, v in rate_trackers.items() if not k.startswith('download_')} + download_trackers = {k: v for k, v in rate_trackers.items() if k.startswith('download_')} + + def print_tracker_stats(trackers, tracker_type): + if not trackers: + logger.info(f"No historical {tracker_type} trackers found.") + return + + logger.info(f"Historical {tracker_type} Trackers:") + for key, timestamps in sorted(trackers.items()): + windows = { + 'last 10 min': 600, 'last 60 min': 3600, + 'last 6 hours': 21600, 'last 24 hours': 86400 + } + rates_str_parts = [] + for name, seconds in windows.items(): + count = sum(1 for ts in timestamps if now - ts <= seconds) + rate_rpm = (count / seconds) * 60 if seconds > 0 else 0 + rates_str_parts.append(f"{count} in {name} ({rate_rpm:.2f}/min)") + + # Clean up key for display + display_key = key.replace('download_', '').replace('per_ip', 'all_proxies/ips') + logger.info(f" - Tracker '{display_key}': " + ", ".join(rates_str_parts)) + + print_tracker_stats(fetch_trackers, "Fetch Request") + print_tracker_stats(download_trackers, "Download Attempt") + + if not self.events: + logger.info("\nNo new events were recorded in this session.") + return + + duration = time.time() - self.start_time + fetch_events = [e for e in self.events if e.get('type') == 'fetch'] + download_events = [e for e in self.events if e.get('type') != 'fetch'] + + logger.info("\n--- Test Summary (This Run) ---") + logger.info(f"Total duration: {duration:.2f} seconds") + logger.info(f"Total info.json requests 
(cumulative): {self.get_request_count()}") + + if policy: + logger.info("\n--- Test Configuration ---") + settings = policy.get('settings', {}) + d_policy = policy.get('download_policy', {}) + + if settings.get('urls_file'): + logger.info(f"URL source file: {settings['urls_file']}") + if settings.get('info_json_dir'): + logger.info(f"Info.json source dir: {settings['info_json_dir']}") + + if d_policy: + logger.info(f"Download formats: {d_policy.get('formats', 'N/A')}") + if d_policy.get('downloader'): + logger.info(f"Downloader: {d_policy.get('downloader')}") + if d_policy.get('downloader_args'): + logger.info(f"Downloader args: {d_policy.get('downloader_args')}") + if d_policy.get('pause_before_download_seconds'): + logger.info(f"Pause before download: {d_policy.get('pause_before_download_seconds')}s") + if d_policy.get('sleep_between_formats'): + sleep_cfg = d_policy.get('sleep_between_formats') + logger.info(f"Sleep between formats: {sleep_cfg.get('min_seconds', 0)}-{sleep_cfg.get('max_seconds', 0)}s") + + if fetch_events: + total_fetches = len(fetch_events) + successful_fetches = sum(1 for e in fetch_events if e['success']) + cancelled_fetches = sum(1 for e in fetch_events if e.get('error_type') == 'Cancelled') + failed_fetches = total_fetches - successful_fetches - cancelled_fetches + + logger.info("\n--- Fetch Summary (This Run) ---") + logger.info(f"Total info.json fetch attempts: {total_fetches}") + logger.info(f" - Successful: {successful_fetches}") + logger.info(f" - Failed: {failed_fetches}") + if cancelled_fetches > 0: + logger.info(f" - Cancelled: {cancelled_fetches}") + + completed_fetches = successful_fetches + failed_fetches + if completed_fetches > 0: + success_rate = (successful_fetches / completed_fetches) * 100 + logger.info(f"Success rate (of completed): {success_rate:.2f}%") + elif total_fetches > 0: + logger.info("Success rate: N/A (no tasks completed)") + + if duration > 1 and total_fetches > 0: + rpm = (total_fetches / duration) * 60 + logger.info(f"Actual fetch rate: {rpm:.2f} requests/minute") + + if failed_fetches > 0: + error_counts = collections.Counter( + e.get('error_type', 'Unknown') + for e in fetch_events if not e['success'] and e.get('error_type') != 'Cancelled' + ) + logger.info("Failure breakdown:") + for error_type, count in sorted(error_counts.items()): + logger.info(f" - {error_type}: {count}") + + profile_counts = collections.Counter(e.get('profile') for e in fetch_events if e.get('profile')) + if profile_counts: + logger.info("Requests per profile:") + for profile, count in sorted(profile_counts.items()): + logger.info(f" - {profile}: {count}") + + proxy_counts = collections.Counter(e.get('proxy_url') for e in fetch_events if e.get('proxy_url')) + if proxy_counts: + logger.info("Requests per proxy:") + for proxy, count in sorted(proxy_counts.items()): + logger.info(f" - {proxy}: {count}") + + if download_events: + total_attempts = len(download_events) + successes = sum(1 for e in download_events if e['success']) + cancelled = sum(1 for e in download_events if e.get('error_type') == 'Cancelled') + failures = total_attempts - successes - cancelled + + # --- Profile Association for Download Events --- + download_profiles = [e.get('profile') for e in download_events] + + # For download_only mode, we might need to fall back to regex extraction + # if the profile wasn't passed down (e.g., no profile grouping). 
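+            # The first capture group of 'profile_extraction_regex' is treated as the profile name.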
+ profile_regex = None + if policy: + settings = policy.get('settings', {}) + if settings.get('mode') == 'download_only': + profile_regex = settings.get('profile_extraction_regex') + + if profile_regex: + for i, e in enumerate(download_events): + if not download_profiles[i]: # If profile wasn't set in the event + path = Path(e.get('path', '')) + match = re.search(profile_regex, path.name) + if match and match.groups(): + download_profiles[i] = match.group(1) + + # Replace any remaining Nones with 'unknown_profile' + download_profiles = [p or 'unknown_profile' for p in download_profiles] + + num_profiles_used = len(set(p for p in download_profiles if p != 'unknown_profile')) + + logger.info("\n--- Download Summary (This Run) ---") + if policy: + workers = policy.get('execution_control', {}).get('workers', 'N/A') + logger.info(f"Workers configured: {workers}") + + logger.info(f"Profiles utilized for downloads: {num_profiles_used}") + logger.info(f"Total download attempts: {total_attempts}") + logger.info(f" - Successful: {successes}") + logger.info(f" - Failed: {failures}") + if cancelled > 0: + logger.info(f" - Cancelled: {cancelled}") + + completed_downloads = successes + failures + if completed_downloads > 0: + success_rate = (successes / completed_downloads) * 100 + logger.info(f"Success rate (of completed): {success_rate:.2f}%") + elif total_attempts > 0: + logger.info("Success rate: N/A (no tasks completed)") + + duration_hours = duration / 3600.0 + if duration > 1 and total_attempts > 0: + dpm = (total_attempts / duration) * 60 + logger.info(f"Actual overall download rate: {dpm:.2f} attempts/minute") + + total_bytes = sum(e.get('downloaded_bytes', 0) for e in download_events if e['success']) + if total_bytes > 0: + logger.info(f"Total data downloaded: {format_size(total_bytes)}") + + if failures > 0: + error_counts = collections.Counter( + e.get('error_type', 'Unknown') + for e in download_events if not e['success'] and e.get('error_type') != 'Cancelled' + ) + logger.info("Failure breakdown:") + for error_type, count in sorted(error_counts.items()): + logger.info(f" - {error_type}: {count}") + + # Add profile to each download event for easier counting + for i, e in enumerate(download_events): + e['profile'] = download_profiles[i] + + profile_counts = collections.Counter(e.get('profile') for e in download_events if e.get('profile')) + if profile_counts: + logger.info("Downloads per profile:") + for profile, count in sorted(profile_counts.items()): + rate_per_hour = (count / duration_hours) if duration_hours > 0 else 0 + logger.info(f" - {profile}: {count} attempts (avg this run: {rate_per_hour:.2f}/hour)") + + proxy_counts = collections.Counter(e.get('proxy_url') for e in download_events if e.get('proxy_url')) + if proxy_counts: + logger.info("Downloads per proxy:") + for proxy, count in sorted(proxy_counts.items()): + rate_per_hour = (count / duration_hours) if duration_hours > 0 else 0 + logger.info(f" - {proxy}: {count} attempts (avg this run: {rate_per_hour:.2f}/hour)") + + logger.info("--------------------") + + +def _run_download_logic(source, info_json_content, policy, state_manager, profile_name=None): + """Shared download logic for a single info.json.""" + proxy_url = None + if info_json_content: + try: + info_data = json.loads(info_json_content) + proxy_url = info_data.get('_proxy_url') + except (json.JSONDecodeError, AttributeError): + logger.warning(f"[{get_display_name(source)}] Could not parse info.json to get proxy for download controls.") + + if not 
state_manager.check_and_update_download_rate_limit(proxy_url, policy): + return [] + + state_manager.wait_for_proxy_cooldown(proxy_url, policy) + results = process_info_json_cycle(source, info_json_content, policy, state_manager, proxy_url=proxy_url, profile_name=profile_name) + state_manager.update_proxy_finish_time(proxy_url) + return results + + +def process_profile_task(profile_name, file_list, policy, state_manager, cycle_num): + """Worker task for a profile, processing its files sequentially.""" + logger.info(f"Worker {get_worker_id()} starting task for profile '{profile_name}' with {len(file_list)} files.") + all_results = [] + for i, file_path in enumerate(file_list): + if shutdown_event.is_set(): + logger.info(f"Shutdown requested, stopping task for profile '{profile_name}'.") + break + + try: + with open(file_path, 'r', encoding='utf-8') as f: + info_json_content = f.read() + except (IOError, FileNotFoundError) as e: + logger.error(f"[{get_display_name(file_path)}] Could not read info.json file: {e}") + continue # Skip this file + + results_for_file = _run_download_logic(file_path, info_json_content, policy, state_manager, profile_name=profile_name) + all_results.extend(results_for_file) + + # Check for stop conditions after processing each file + should_stop_profile = False + for result in results_for_file: + if not result['success']: + s_conditions = policy.get('stop_conditions', {}) + if s_conditions.get('on_failure') or \ + (s_conditions.get('on_http_403') and result['error_type'] == 'HTTP 403') or \ + (s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'): + logger.info(f"Stopping further processing for profile '{profile_name}' due to failure.") + should_stop_profile = True + break + if should_stop_profile: + break + + # Apply sleep between tasks for this profile + if i < len(file_list) - 1: + exec_control = policy.get('execution_control', {}) + sleep_cfg = exec_control.get('sleep_between_tasks', {}) + sleep_min = sleep_cfg.get('min_seconds', 0) + + if sleep_min > 0: + sleep_max = sleep_cfg.get('max_seconds') or sleep_min + sleep_duration = random.uniform(sleep_min, sleep_max) if sleep_max > sleep_min else sleep_min + + logger.debug(f"Profile '{profile_name}' sleeping for {sleep_duration:.2f}s before next file.") + # Interruptible sleep + sleep_end_time = time.time() + sleep_duration + while time.time() < sleep_end_time: + if shutdown_event.is_set(): + break + time.sleep(0.2) + + return all_results + + +def run_command(cmd, input_data=None, binary_stdout=False): + """ + Runs a command, captures its output, and returns status. + If binary_stdout is True, stdout is returned as bytes. Otherwise, both are decoded strings. + """ + logger.debug(f"Running command: {' '.join(cmd)}") + process = None + try: + # Always open in binary mode to handle both cases. We will decode later. 
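+        # The child process is also registered in running_processes below so it can be terminated on shutdown.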
+ process = subprocess.Popen( + cmd, + stdin=subprocess.PIPE if input_data else None, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + preexec_fn=os.setsid # Start in a new process group to isolate from terminal signals + ) + with process_lock: + running_processes.add(process) + + stdout_capture = [] + stderr_capture = [] + + def read_pipe(pipe, capture_list, display_pipe=None): + """Reads a pipe line by line (as bytes), appending to a list and optionally displaying.""" + for line in iter(pipe.readline, b''): + capture_list.append(line) + if display_pipe: + # Decode for display + display_line = line.decode('utf-8', errors='replace') + display_pipe.write(display_line) + display_pipe.flush() + + # We must read stdout and stderr in parallel to prevent deadlocks. + stdout_thread = threading.Thread(target=read_pipe, args=(process.stdout, stdout_capture)) + # Display stderr in real-time as it often contains progress info. + stderr_thread = threading.Thread(target=read_pipe, args=(process.stderr, stderr_capture, sys.stderr)) + + stdout_thread.start() + stderr_thread.start() + + # Handle stdin after starting to read outputs to avoid deadlocks. + if input_data: + try: + process.stdin.write(input_data.encode('utf-8')) + process.stdin.close() + except (IOError, BrokenPipeError): + # This can happen if the process exits quickly or doesn't read stdin. + logger.debug(f"Could not write to stdin for command: {' '.join(cmd)}. Process may have already exited.") + + # Wait for the process to finish and for all output to be read. + retcode = process.wait() + stdout_thread.join() + stderr_thread.join() + + stdout_bytes = b"".join(stdout_capture) + stderr_bytes = b"".join(stderr_capture) + + stdout = stdout_bytes if binary_stdout else stdout_bytes.decode('utf-8', errors='replace') + stderr = stderr_bytes.decode('utf-8', errors='replace') + + return retcode, stdout, stderr + + except FileNotFoundError: + logger.error(f"Command not found: {cmd[0]}. Make sure it's in your PATH.") + return -1, "", f"Command not found: {cmd[0]}" + except Exception as e: + logger.error(f"An error occurred while running command: {' '.join(cmd)}. Error: {e}") + return -1, "", str(e) + finally: + if process: + with process_lock: + running_processes.discard(process) + + +def run_download_worker(info_json_path, info_json_content, format_to_download, policy, profile_name=None): + """ + Performs a single download attempt. Designed to be run in a worker thread. + """ + download_policy = policy.get('download_policy', {}) + settings = policy.get('settings', {}) + downloader = download_policy.get('downloader') + + # Get script command from settings, with fallback to download_policy for old format. + script_cmd_str = settings.get('download_script') + if not script_cmd_str: + script_cmd_str = download_policy.get('script') + + if script_cmd_str: + download_cmd = shlex.split(script_cmd_str) + elif downloader == 'aria2c_rpc': + download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'aria-rpc'] + elif downloader == 'native-cli': + download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'cli'] + else: + # Default to the new native-py downloader if downloader is 'native-py' or not specified. 
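+        # This invokes the native-py downloader through the ytops_client CLI entry point.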
+ download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'py'] + + download_cmd.extend(['-f', format_to_download]) + + if downloader == 'aria2c_rpc': + if download_policy.get('aria_host'): + download_cmd.extend(['--aria-host', str(download_policy['aria_host'])]) + if download_policy.get('aria_port'): + download_cmd.extend(['--aria-port', str(download_policy['aria_port'])]) + if download_policy.get('aria_secret'): + download_cmd.extend(['--aria-secret', str(download_policy['aria_secret'])]) + if download_policy.get('output_dir'): + download_cmd.extend(['--output-dir', str(download_policy['output_dir'])]) + if download_policy.get('aria_remote_dir'): + download_cmd.extend(['--remote-dir', str(download_policy['aria_remote_dir'])]) + if download_policy.get('aria_fragments_dir'): + download_cmd.extend(['--fragments-dir', str(download_policy['aria_fragments_dir'])]) + # For stress testing, waiting is the desired default to get a success/fail result. + # Allow disabling it by explicitly setting aria_wait: false in the policy. + if download_policy.get('aria_wait', True): + download_cmd.append('--wait') + + if download_policy.get('auto_merge_fragments'): + download_cmd.append('--auto-merge-fragments') + if download_policy.get('remove_fragments_after_merge'): + download_cmd.append('--remove-fragments-after-merge') + if download_policy.get('cleanup'): + download_cmd.append('--cleanup') + if download_policy.get('purge_on_complete'): + download_cmd.append('--purge-on-complete') + + downloader_args = download_policy.get('downloader_args') + proxy = download_policy.get('proxy') + if proxy: + # Note: proxy_rename is not supported for aria2c_rpc mode. + proxy_arg = f"--all-proxy {shlex.quote(str(proxy))}" + if downloader_args: + downloader_args = f"{downloader_args} {proxy_arg}" + else: + downloader_args = proxy_arg + + if downloader_args: + # For aria2c_rpc, the downloader_args value is passed directly to the script's --downloader-args option. + download_cmd.extend(['--downloader-args', downloader_args]) + elif downloader == 'native-cli': + # This is the logic for the legacy download_tool.py (yt-dlp CLI wrapper). + pause_seconds = download_policy.get('pause_before_download_seconds') + if pause_seconds and isinstance(pause_seconds, (int, float)) and pause_seconds > 0: + download_cmd.extend(['--pause', str(pause_seconds)]) + + if download_policy.get('continue_downloads'): + download_cmd.append('--download-continue') + + # Add proxy if specified directly in the policy + proxy = download_policy.get('proxy') + if proxy: + download_cmd.extend(['--proxy', str(proxy)]) + + proxy_rename = download_policy.get('proxy_rename') + if proxy_rename: + download_cmd.extend(['--proxy-rename', str(proxy_rename)]) + + extra_args = download_policy.get('extra_args') + if extra_args: + download_cmd.extend(shlex.split(extra_args)) + + # Note: 'downloader' here refers to yt-dlp's internal downloader, not our script. + # The policy key 'external_downloader' is more clear, but we support 'downloader' for backward compatibility. 
ext_downloader = download_policy.get('external_downloader') or download_policy.get('downloader')
+        if ext_downloader and ext_downloader not in ['native-cli', 'native-py', 'aria2c_rpc']:
+            download_cmd.extend(['--downloader', str(ext_downloader)])
+
+        downloader_args = download_policy.get('downloader_args')
+        if downloader_args:
+            download_cmd.extend(['--downloader-args', str(downloader_args)])
+
+        if download_policy.get('merge_output_format'):
+            download_cmd.extend(['--merge-output-format', str(download_policy['merge_output_format'])])
+
+        if download_policy.get('cleanup'):
+            download_cmd.append('--cleanup')
+    else:
+        # This is the default logic for the new native-py downloader.
+        if download_policy.get('output_to_buffer'):
+            download_cmd.append('--output-buffer')
+        else:
+            # --output-dir is only relevant if not outputting to buffer.
+            if download_policy.get('output_dir'):
+                download_cmd.extend(['--output-dir', str(download_policy['output_dir'])])
+
+        if download_policy.get('temp_path'):
+            download_cmd.extend(['--temp-path', str(download_policy['temp_path'])])
+        if download_policy.get('continue_downloads'):
+            download_cmd.append('--download-continue')
+
+        pause_seconds = download_policy.get('pause_before_download_seconds')
+        if pause_seconds and isinstance(pause_seconds, (int, float)) and pause_seconds > 0:
+            download_cmd.extend(['--pause', str(pause_seconds)])
+
+        proxy = download_policy.get('proxy')
+        if proxy:
+            download_cmd.extend(['--proxy', str(proxy)])
+
+        proxy_rename = download_policy.get('proxy_rename')
+        if proxy_rename:
+            download_cmd.extend(['--proxy-rename', str(proxy_rename)])
+
+        extra_args = download_policy.get('extra_args')
+        if extra_args:
+            download_cmd.extend(['--extra-ytdlp-args', str(extra_args)])
+
+        # Pass through downloader settings for yt-dlp to use,
+        # e.g. to tell yt-dlp to use aria2c as its backend.
+        ext_downloader = download_policy.get('external_downloader')
+        if ext_downloader:
+            download_cmd.extend(['--downloader', str(ext_downloader)])
+
+        downloader_args = download_policy.get('downloader_args')
+        if downloader_args:
+            download_cmd.extend(['--downloader-args', str(downloader_args)])
+
+    worker_id = get_worker_id()
+    display_name = get_display_name(info_json_path)
+    profile_log_part = f" [Profile: {profile_name}]" if profile_name else ""
+    log_prefix = f"[Worker {worker_id}]{profile_log_part} [{display_name} @ {format_to_download}]"
+    logger.info(f"{log_prefix} Kicking off download process...")
+
+    temp_info_file_path = None
+    try:
+        if isinstance(info_json_path, Path) and info_json_path.exists():
+            # The info.json is already in a file, pass its path directly.
+            download_cmd.extend(['--load-info-json', str(info_json_path)])
+        else:
+            # The info.json content is in memory, so write it to a temporary file.
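+            # The temporary file is cleaned up in the finally block after the download command returns.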
+ import tempfile + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', encoding='utf-8') as temp_f: + temp_f.write(info_json_content) + temp_info_file_path = temp_f.name + download_cmd.extend(['--load-info-json', temp_info_file_path]) + + cmd_str_for_log = ' '.join(shlex.quote(s) for s in download_cmd) + logger.info(f"{log_prefix} Running download command: {cmd_str_for_log}") + output_to_buffer = download_policy.get('output_to_buffer', False) + retcode, stdout, stderr = run_command(download_cmd, binary_stdout=output_to_buffer) + finally: + if temp_info_file_path and os.path.exists(temp_info_file_path): + os.unlink(temp_info_file_path) + + is_403_error = "HTTP Error 403" in stderr + is_timeout_error = "Read timed out" in stderr + output_to_buffer = download_policy.get('output_to_buffer', False) + + result = { + 'type': 'download', + 'path': str(info_json_path), + 'format': format_to_download, + 'success': retcode == 0, + 'error_type': None, + 'details': '', + 'downloaded_bytes': 0, + 'profile': profile_name + } + + if retcode == 0: + details_str = "OK" + size_in_bytes = 0 + if output_to_buffer: + # The most accurate size is the length of the stdout buffer. + size_in_bytes = len(stdout) # stdout is bytes + details_str += f" (Buffered {format_size(size_in_bytes)})" + else: + size_match = re.search(r'\[download\]\s+100%\s+of\s+~?([0-9.]+)(B|KiB|MiB|GiB)', stderr) + if size_match: + value = float(size_match.group(1)) + unit = size_match.group(2) + multipliers = {"B": 1, "KiB": 1024, "MiB": 1024**2, "GiB": 1024**3} + size_in_bytes = int(value * multipliers.get(unit, 1)) + details_str += f" ({size_match.group(1)}{unit})" + + result['downloaded_bytes'] = size_in_bytes + result['details'] = details_str + else: + # Check both stdout and stderr for error messages, as logging might be directed to stdout. + full_output = f"{stdout}\n{stderr}" + error_lines = [line for line in full_output.strip().split('\n') if 'ERROR:' in line] + result['details'] = error_lines[-1].strip() if error_lines else "Unknown error" + + if is_403_error: + result['error_type'] = 'HTTP 403' + elif is_timeout_error: + result['error_type'] = 'Timeout' + else: + result['error_type'] = f'Exit Code {retcode}' + + return result + + +def process_info_json_cycle(path, content, policy, state_manager, proxy_url=None, profile_name=None): + """ + Processes one info.json file for one cycle, downloading selected formats. + """ + results = [] + display_name = get_display_name(path) + d_policy = policy.get('download_policy', {}) + s_conditions = policy.get('stop_conditions', {}) + format_selection = d_policy.get('formats', '') + + try: + info_data = json.loads(content) + available_formats = [f['format_id'] for f in info_data.get('formats', [])] + if not available_formats: + logger.warning(f"[{display_name}] No formats found in info.json. 
Skipping.") + return [] + + formats_to_test = [] + if format_selection == 'all': + formats_to_test = available_formats + elif format_selection.startswith('random:'): + percent = float(format_selection.split(':')[1].rstrip('%')) + count = max(1, int(len(available_formats) * (percent / 100.0))) + formats_to_test = random.sample(available_formats, k=count) + elif format_selection.startswith('random_from:'): + choices = [f.strip() for f in format_selection.split(':', 1)[1].split(',')] + valid_choices = [f for f in choices if f in available_formats] + if valid_choices: + formats_to_test = [random.choice(valid_choices)] + else: + requested_formats = [f.strip() for f in format_selection.split(',') if f.strip()] + formats_to_test = [] + for req_fmt in requested_formats: + # Check for exact match first + if req_fmt in available_formats: + formats_to_test.append(req_fmt) + continue + + # If no exact match, check for formats that start with this ID + '-' + # e.g., req_fmt '140' should match '140-0' + prefix_match = f"{req_fmt}-" + first_match = next((af for af in available_formats if af.startswith(prefix_match)), None) + + if first_match: + logger.info(f"[{display_name}] Requested format '{req_fmt}' not found. Using first available match: '{first_match}'.") + formats_to_test.append(first_match) + else: + # This could be a complex selector like 'bestvideo' or '299/298', so keep it. + if req_fmt not in available_formats: + logger.warning(f"[{display_name}] Requested format '{req_fmt}' not found in available formats.") + formats_to_test.append(req_fmt) + + except json.JSONDecodeError: + logger.error(f"[{display_name}] Failed to parse info.json. Skipping.") + return [] + + for i, format_id in enumerate(formats_to_test): + if shutdown_event.is_set(): + logger.info(f"Shutdown requested, stopping further format tests for {display_name}.") + break + + # Check if the format URL is expired before attempting to download + format_details = next((f for f in info_data.get('formats', []) if f.get('format_id') == format_id), None) + if format_details and 'url' in format_details: + parsed_url = urlparse(format_details['url']) + query_params = parse_qs(parsed_url.query) + expire_ts_str = query_params.get('expire', [None])[0] + if expire_ts_str and expire_ts_str.isdigit(): + expire_ts = int(expire_ts_str) + if expire_ts < time.time(): + logger.warning(f"[{display_name}] Skipping format '{format_id}' because its URL is expired.") + result = { + 'type': 'download', 'path': str(path), 'format': format_id, + 'success': True, 'error_type': 'Skipped', + 'details': 'Download URL is expired', 'downloaded_bytes': 0 + } + if proxy_url: + result['proxy_url'] = proxy_url + state_manager.log_event(result) + results.append(result) + continue # Move to the next format + + result = run_download_worker(path, content, format_id, policy, profile_name=profile_name) + if proxy_url: + result['proxy_url'] = proxy_url + state_manager.log_event(result) + results.append(result) + + worker_id = get_worker_id() + status = "SUCCESS" if result['success'] else f"FAILURE ({result['error_type']})" + profile_log_part = f" [Profile: {profile_name}]" if profile_name else "" + logger.info(f"[Worker {worker_id}]{profile_log_part} Result for {display_name} (format {format_id}): {status} - {result.get('details', 'OK')}") + + if not result['success']: + if s_conditions.get('on_failure') or \ + (s_conditions.get('on_http_403') and result['error_type'] == 'HTTP 403') or \ + (s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'): + 
logger.info(f"Stopping further format tests for {display_name} in this cycle due to failure.") + break + + sleep_cfg = d_policy.get('sleep_between_formats', {}) + sleep_min = sleep_cfg.get('min_seconds', 0) + if sleep_min > 0 and i < len(formats_to_test) - 1: + sleep_max = sleep_cfg.get('max_seconds') or sleep_min + if sleep_max > sleep_min: + sleep_duration = random.uniform(sleep_min, sleep_max) + else: + sleep_duration = sleep_min + + logger.debug(f"Sleeping for {sleep_duration:.2f}s between formats for {display_name}.") + # Interruptible sleep + sleep_end_time = time.time() + sleep_duration + while time.time() < sleep_end_time: + if shutdown_event.is_set(): + break + time.sleep(0.2) + + return results + + +def update_dict(d, u): + """Recursively update a dictionary.""" + for k, v in u.items(): + if isinstance(v, collections.abc.Mapping): + d[k] = update_dict(d.get(k, {}), v) + else: + d[k] = v + return d + + +def load_policy(policy_file, policy_name=None): + """Load a policy from a YAML file.""" + try: + with open(policy_file, 'r', encoding='utf-8') as f: + # If a policy name is given, look for that specific document + if policy_name: + docs = list(yaml.safe_load_all(f)) + for doc in docs: + if isinstance(doc, dict) and doc.get('name') == policy_name: + return doc + raise ValueError(f"Policy '{policy_name}' not found in {policy_file}") + # Otherwise, load the first document + return yaml.safe_load(f) + except (IOError, yaml.YAMLError, ValueError) as e: + logger.error(f"Failed to load policy file {policy_file}: {e}") + sys.exit(1) + + +def apply_overrides(policy, overrides): + """Apply command-line overrides to the policy.""" + for override in overrides: + try: + key, value = override.split('=', 1) + keys = key.split('.') + + # Try to parse as JSON/YAML if it looks like a list or dict, otherwise treat as scalar + if (value.startswith('[') and value.endswith(']')) or \ + (value.startswith('{') and value.endswith('}')): + try: + value = yaml.safe_load(value) + except yaml.YAMLError: + logger.warning(f"Could not parse override value '{value}' as YAML. Treating as a string.") + else: + # Try to auto-convert scalar value type + if value.lower() == 'true': + value = True + elif value.lower() == 'false': + value = False + elif value.lower() == 'null': + value = None + else: + try: + value = int(value) + except ValueError: + try: + value = float(value) + except ValueError: + pass # Keep as string + + d = policy + for k in keys[:-1]: + d = d.setdefault(k, {}) + d[keys[-1]] = value + except ValueError: + logger.error(f"Invalid override format: '{override}'. Use 'key.subkey=value'.") + sys.exit(1) + return policy + + +def display_effective_policy(policy, name, sources=None, profile_names=None, original_workers_setting=None): + """Prints a human-readable summary of the effective policy.""" + logger.info(f"--- Effective Policy: {name} ---") + settings = policy.get('settings', {}) + exec_control = policy.get('execution_control', {}) + + logger.info(f"Mode: {settings.get('mode', 'full_stack')}") + if profile_names: + num_profiles = len(profile_names) + logger.info(f"Profiles found: {num_profiles}") + if num_profiles > 0: + # Sort profiles for consistent display, show top 10 + sorted_profiles = sorted(profile_names) + profiles_to_show = sorted_profiles[:10] + logger.info(f" (e.g., {', '.join(profiles_to_show)}{'...' 
if num_profiles > 10 else ''})") + + workers_display = str(exec_control.get('workers', 1)) + if original_workers_setting == 'auto': + workers_display = f"auto (calculated: {workers_display})" + logger.info(f"Workers: {workers_display}") + + sleep_cfg = exec_control.get('sleep_between_tasks', {}) + sleep_min = sleep_cfg.get('min_seconds') + if sleep_min is not None: + sleep_max = sleep_cfg.get('max_seconds') or sleep_min + if sleep_max > sleep_min: + logger.info(f"Sleep between tasks (per worker): {sleep_min}-{sleep_max}s (random)") + else: + logger.info(f"Sleep between tasks (per worker): {sleep_min}s") + + run_until = exec_control.get('run_until', {}) + run_conditions = [] + if 'minutes' in run_until: + run_conditions.append(f"for {run_until['minutes']} minutes") + if 'requests' in run_until: + run_conditions.append(f"until {run_until['requests']} total requests") + if 'cycles' in run_until: + run_conditions.append(f"for {run_until['cycles']} cycles") + + if run_conditions: + logger.info(f"Run condition: Stop after running {' or '.join(run_conditions)}.") + if 'minutes' in run_until and 'cycles' not in run_until: + logger.info("Will continuously cycle through sources until time limit is reached.") + else: + logger.warning("WARNING: No 'run_until' condition is set. This test will run forever unless stopped manually.") + logger.info("Run condition: No stop condition defined, will run indefinitely (until Ctrl+C).") + + # --- Rate Calculation --- + if sources: + workers = exec_control.get('workers', 1) + num_sources = len(profile_names) if profile_names else len(sources) + + min_sleep = sleep_cfg.get('min_seconds', 0) + max_sleep = sleep_cfg.get('max_seconds') or min_sleep + avg_sleep_per_task = (min_sleep + max_sleep) / 2 + + # Assume an average task duration. This is a major assumption. + mode = settings.get('mode', 'full_stack') + assumptions = exec_control.get('assumptions', {}) + + assumed_fetch_duration = 0 + if mode in ['full_stack', 'fetch_only']: + assumed_fetch_duration = assumptions.get('fetch_task_duration', 12 if mode == 'full_stack' else 3) + + assumed_download_duration = 0 + if mode in ['full_stack', 'download_only']: + # This assumes the total time to download all formats for a single source. + assumed_download_duration = assumptions.get('download_task_duration', 60) + + total_assumed_task_duration = assumed_fetch_duration + assumed_download_duration + + if workers > 0 and total_assumed_task_duration > 0: + total_time_per_task = total_assumed_task_duration + avg_sleep_per_task + tasks_per_minute_per_worker = 60 / total_time_per_task + total_tasks_per_minute = tasks_per_minute_per_worker * workers + + logger.info("--- Rate Estimation ---") + logger.info(f"Source count: {num_sources}") + if mode in ['full_stack', 'fetch_only']: + logger.info(f"Est. fetch time per source: {assumed_fetch_duration}s (override via execution_control.assumptions.fetch_task_duration)") + if mode in ['full_stack', 'download_only']: + logger.info(f"Est. download time per source: {assumed_download_duration}s (override via execution_control.assumptions.download_task_duration)") + logger.info(" (Note: This assumes total time for all formats per source)") + + logger.info(f"Est. 
sleep per task: {avg_sleep_per_task:.1f}s") + logger.info(f"==> Expected task rate: ~{total_tasks_per_minute:.2f} tasks/minute ({workers} workers * {tasks_per_minute_per_worker:.2f} tasks/min/worker)") + + target_rate_cfg = exec_control.get('target_rate', {}) + target_reqs = target_rate_cfg.get('requests') + target_mins = target_rate_cfg.get('per_minutes') + if target_reqs and target_mins: + target_rpm = target_reqs / target_mins + logger.info(f"Target rate: {target_rpm:.2f} tasks/minute") + if total_tasks_per_minute < target_rpm * 0.8: + logger.warning("Warning: Expected rate is significantly lower than target rate.") + logger.warning("Consider increasing workers, reducing sleep, or checking task performance.") + + logger.info("---------------------------------") + time.sleep(2) # Give user time to read + + +def add_stress_policy_parser(subparsers): + """Add the parser for the 'stress-policy' command.""" + parser = subparsers.add_parser( + 'stress-policy', + description="The primary, policy-driven stress-testing orchestrator.\nIt runs complex, multi-stage stress tests based on a YAML policy file.\nUse '--list-policies' to see available pre-configured scenarios.\n\nModes supported:\n- full_stack: Generate info.json and then download from it.\n- fetch_only: Only generate info.json files.\n- download_only: Only download from existing info.json files.", + formatter_class=argparse.RawTextHelpFormatter, + help='Run advanced, policy-driven stress tests (recommended).', + epilog=""" +Examples: + +1. Fetch info.jsons for a TV client with a single profile and a rate limit: + ytops-client stress-policy --policy policies/1_fetch_only_policies.yaml \\ + --policy-name tv_downgraded_single_profile \\ + --set settings.urls_file=my_urls.txt \\ + --set execution_control.run_until.minutes=30 + # This runs a 'fetch_only' test using the 'tv_downgraded' client. It uses a single, + # static profile for all requests and enforces a safety limit of 450 requests per hour. + +2. Fetch info.jsons for an Android client using cookies for authentication: + ytops-client stress-policy --policy policies/1_fetch_only_policies.yaml \\ + --policy-name android_sdkless_with_cookies \\ + --set settings.urls_file=my_urls.txt \\ + --set info_json_generation_policy.request_params.cookies_file_path=/path/to/my_cookies.txt + # This demonstrates an authenticated 'fetch_only' test. It passes the path to a + # Netscape cookie file, which the server will use for the requests. + +3. Download from a folder of info.jsons, grouped by profile, with auto-workers: + ytops-client stress-policy --policy policies/2_download_only_policies.yaml \\ + --policy-name basic_profile_aware_download \\ + --set settings.info_json_dir=/path/to/my/infojsons + # This runs a 'download_only' test. It scans a directory, extracts profile names from + # the filenames (e.g., 'tv_user_1' from '...-VIDEOID-tv_user_1.json'), and groups + # them. 'workers=auto' sets the number of workers to the number of unique profiles found. + +4. Full-stack test with multiple workers and profile rotation: + ytops-client stress-policy --policy policies/3_full_stack_policies.yaml \\ + --policy-name tv_simply_profile_rotation \\ + --set settings.urls_file=my_urls.txt \\ + --set execution_control.workers=4 \\ + --set settings.profile_management.max_requests_per_profile=500 + # This runs a 'full_stack' test with 4 parallel workers. Each worker gets a unique + # profile (e.g., tv_simply_user_0_0, tv_simply_user_1_0, etc.). 
After a profile is + # used 500 times, it is retired, and a new "generation" is created (e.g., tv_simply_user_0_1). + +5. Full-stack authenticated test with a pool of profiles and corresponding cookie files: + ytops-client stress-policy --policy policies/3_full_stack_policies.yaml \\ + --policy-name mweb_multi_profile_with_cookies \\ + --set settings.urls_file=my_urls.txt \\ + --set settings.profile_management.cookie_files='["/path/c1.txt","/path/c2.txt"]' + # This runs a 'full_stack' test using a pool of profiles (e.g., mweb_user_0, mweb_user_1). + # It uses the 'cookie_files' list to assign a specific cookie file to each profile in the + # pool, enabling multi-account authenticated testing. Note the JSON/YAML list format for the override. + +6. Full-stack test submitting downloads to an aria2c RPC server: + ytops-client stress-policy --policy policies/3_full_stack_policies.yaml \\ + --policy-name tv_simply_profile_rotation_aria2c_rpc \\ + --set settings.urls_file=my_urls.txt \\ + --set download_policy.aria_host=192.168.1.100 \\ + --set download_policy.aria_port=6801 + # This runs a test where downloads are not performed by the worker itself, but are + # sent to a remote aria2c daemon. The policy specifies 'downloader: aria2c_rpc' + # and provides connection details. This is useful for offloading download traffic. + +-------------------------------------------------------------------------------- +Overridable Policy Parameters via --set: + + Key Description + -------------------------------------- ------------------------------------------------ + [settings] + settings.mode Test mode: 'full_stack', 'fetch_only', or 'download_only'. + settings.urls_file Path to file with URLs/video IDs. + settings.info_json_dir Path to directory with existing info.json files. + settings.profile_extraction_regex For 'download_only' mode, a regex to extract profile names from info.json filenames. The first capture group is used as the profile name. E.g., '.*-(.*?).json'. This enables profile-aware sequential downloading. + settings.info_json_dir_sample_percent Randomly sample this %% of files from the directory (for 'once' scan mode). + settings.directory_scan_mode For 'download_only': 'once' (default) or 'continuous' to watch for new files. + settings.mark_processed_files For 'continuous' scan mode: if true, rename processed files to '*..processed' to avoid reprocessing. + settings.max_files_per_cycle For 'continuous' scan mode: max new files to process per cycle. + settings.sleep_if_no_new_files_seconds For 'continuous' scan mode: seconds to sleep if no new files are found (default: 10). + settings.profile_prefix (Legacy) Prefix for profile names (e.g., 'test_user'). + settings.profile_pool (Legacy) Size of the profile pool. + settings.profile_mode Profile strategy. 'per_request' (legacy), 'per_worker' (legacy), or 'per_worker_with_rotation' (requires profile_management). + settings.info_json_script Command to run the info.json generation script (e.g., 'bin/ytops-client get-info'). + settings.save_info_json_dir If set, save all successfully generated info.json files to this directory. + + [settings.profile_management] (New, preferred method for profile control) + profile_management.prefix Prefix for profile names (e.g., 'dyn_user'). + profile_management.suffix Suffix for profile names. Set to 'auto' for a timestamp, or provide a string. + profile_management.initial_pool_size The number of profiles to start with. 
+ profile_management.auto_expand_pool If true, create new profiles when the initial pool is exhausted (all sleeping). + profile_management.max_requests_per_profile Max requests a profile can make before it must 'sleep'. + profile_management.sleep_minutes_on_exhaustion How many minutes a profile 'sleeps' after hitting its request limit. + profile_management.cookie_files A list of paths to cookie files. Used to assign a unique cookie file to each profile in a pool. + + [execution_control] + execution_control.workers Number of parallel worker threads. Set to "auto" to calculate from target_rate or number of profiles. + execution_control.auto_workers_max The maximum number of workers to use when 'workers' is 'auto' in profile-aware download mode (default: 8). + execution_control.target_rate.requests Target requests for 'auto' workers calculation. + execution_control.target_rate.per_minutes Period in minutes for target_rate. + execution_control.run_until.minutes Stop test after N minutes. Will continuously cycle through sources. + execution_control.run_until.cycles Stop test after N cycles. A cycle is one full pass through all sources. + execution_control.run_until.requests Stop test after N total info.json requests (cumulative across runs). + execution_control.sleep_between_tasks.min_seconds Min sleep time between tasks, per worker. + + [info_json_generation_policy] + info_json_generation_policy.client Client to use (e.g., 'mweb', 'tv_camoufox'). + info_json_generation_policy.auth_host Host for the auth/Thrift service. + info_json_generation_policy.auth_port Port for the auth/Thrift service. + info_json_generation_policy.assigned_proxy_url A specific proxy to use for a request, overriding the server's proxy pool. + info_json_generation_policy.proxy_rename Regex substitution for the assigned proxy URL (e.g., 's/old/new/'). + info_json_generation_policy.command_template A full command template for the info.json script. Overrides other keys. + info_json_generation_policy.rate_limits.per_ip.max_requests Max requests for the given time period from one IP. + info_json_generation_policy.rate_limits.per_ip.per_minutes Time period in minutes for the per_ip rate limit. + info_json_generation_policy.rate_limits.per_profile.max_requests Max requests for a single profile in a time period. + info_json_generation_policy.rate_limits.per_profile.per_minutes Time period in minutes for the per_profile rate limit. + info_json_generation_policy.client_rotation_policy.major_client The primary client to use for most requests. + info_json_generation_policy.client_rotation_policy.refresh_client The client to use periodically to refresh context. + info_json_generation_policy.client_rotation_policy.refresh_every.requests Trigger refresh client after N requests for a profile. + + [download_policy] + download_policy.formats Formats to download (e.g., '18,140', 'random:50%%'). + download_policy.downloader Orchestrator script to use: 'native-py' (default, Python lib), 'native-cli' (legacy CLI wrapper), or 'aria2c_rpc'. + download_policy.external_downloader For 'native-py' or default, the backend yt-dlp should use (e.g., 'aria2c', 'native'). + download_policy.downloader_args Arguments for the external_downloader. For yt-dlp, e.g., 'aria2c:-x 8'. + download_policy.merge_output_format Container to merge to (e.g., 'mkv'). Defaults to 'mp4' via cli.config. + download_policy.temp_path For 'native-py', path to a directory for temporary files (e.g., a RAM disk like /dev/shm). 
+ download_policy.output_to_buffer For 'native-py', download to an in-memory buffer and pipe to stdout instead of saving to a file (true/false). Best for single-file formats. + download_policy.proxy Proxy for direct downloads (e.g., "socks5://127.0.0.1:1080"). + download_policy.proxy_rename Regex substitution for the proxy URL (e.g., 's/old/new/'). + download_policy.pause_before_download_seconds Pause for N seconds before starting each download attempt. + download_policy.continue_downloads Enable download continuation (true/false). + download_policy.cleanup After success: for native downloaders, rename and truncate file to 0 bytes; for 'aria2c_rpc', remove file(s) from filesystem. + download_policy.extra_args A string of extra arguments for the download script (e.g., "--limit-rate 5M"). + download_policy.sleep_per_proxy_seconds Cooldown in seconds between downloads on the same proxy. + download_policy.rate_limits.per_proxy.max_requests Max downloads for a single proxy in a time period. + download_policy.rate_limits.per_proxy.per_minutes Time period in minutes for the per_proxy download rate limit. + # For downloader: 'aria2c_rpc' + download_policy.aria_host Hostname of the aria2c RPC server. + download_policy.aria_port Port of the aria2c RPC server. + download_policy.aria_secret Secret token for the aria2c RPC server. + download_policy.aria_wait Wait for aria2c downloads to complete (true/false). + download_policy.cleanup Remove downloaded file(s) from the filesystem on success. Requires script access to the download directory. + download_policy.purge_on_complete On success, purge ALL completed/failed downloads from aria2c history. Use as a workaround for older aria2c versions where targeted removal fails. + download_policy.output_dir Output directory for downloads. + download_policy.aria_remote_dir The absolute download path on the remote aria2c host. + download_policy.aria_fragments_dir The local path to find fragments for merging (if different from output_dir). + download_policy.auto_merge_fragments For fragmented downloads, automatically merge parts after download (true/false). Requires aria_wait=true. + download_policy.remove_fragments_after_merge For fragmented downloads, delete fragment files after a successful merge (true/false). Requires auto_merge_fragments=true. + + [stop_conditions] + stop_conditions.on_failure Stop on any download failure (true/false). + stop_conditions.on_http_403 Stop on any HTTP 403 error (true/false). + stop_conditions.on_error_rate.max_errors Stop test if more than N errors (of any type) occur within the time period. + stop_conditions.on_error_rate.per_minutes Time period in minutes for the error rate calculation. + stop_conditions.on_cumulative_403.max_errors Stop test if more than N HTTP 403 errors occur within the time period. + stop_conditions.on_cumulative_403.per_minutes Time period in minutes for the cumulative 403 calculation. + stop_conditions.on_quality_degradation.trigger_if_missing_formats A format ID or comma-separated list of IDs. Triggers if any are missing. + stop_conditions.on_quality_degradation.max_triggers Stop test if quality degradation is detected N times. + stop_conditions.on_quality_degradation.per_minutes Time period in minutes for the quality degradation calculation. +-------------------------------------------------------------------------------- +""" + ) + parser.add_argument('--policy', help='Path to the YAML policy file. 
Required unless --list-policies is used.')
+    parser.add_argument('--policy-name', help='Name of the policy to run from a multi-policy file (if it contains "---" separators).')
+    parser.add_argument('--list-policies', action='store_true', help='List all available policies from the default policies directory and exit.')
+    parser.add_argument('--show-overrides', action='store_true', help='Load the specified policy and print all its defined values as a single line of --set arguments, then exit.')
+    parser.add_argument('--set', action='append', default=[], help="Override a policy setting using 'key.subkey=value' format.\n(e.g., --set execution_control.workers=5)")
+    parser.add_argument('--verbose', action='store_true', help='Enable verbose output for the orchestrator and underlying scripts.')
+    parser.add_argument('--dry-run', action='store_true', help='Print the effective policy and exit without running the test.')
+    return parser
+
+
+def list_policies():
+    """Scans the policies directory and prints a list of available policies."""
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    project_root = os.path.abspath(os.path.join(script_dir, '..'))
+    policies_dir = os.path.join(project_root, 'policies')
+
+    if not os.path.isdir(policies_dir):
+        print(f"Error: Policies directory not found at '{policies_dir}'", file=sys.stderr)
+        return 1
+
+    print("Available Policies:")
+    print("=" * 20)
+
+    policy_files = sorted(Path(policies_dir).glob('*.yaml'))
+    if not policy_files:
+        print("No policy files (.yaml) found.")
+        return 0
+
+    for policy_file in policy_files:
+        print(f"\n--- File: {policy_file.relative_to(project_root)} ---")
+        try:
+            with open(policy_file, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            # Split into documents. The separator is a line that is exactly '---'.
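+            # Illustration of the split behaviour (with re.MULTILINE, only a line
+            # consisting solely of '---' matches), e.g.:
+            #   re.split(r'^\-\-\-$', "name: a\n---\nname: b", flags=re.MULTILINE)
+            #   -> ['name: a\n', '\nname: b']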
+ documents = re.split(r'^\-\-\-$', content, flags=re.MULTILINE) + + found_any_in_file = False + for doc in documents: + doc = doc.strip() + if not doc: + continue + + lines = doc.split('\n') + policy_name = None + description_lines = [] + + # Find name and description + for i, line in enumerate(lines): + if line.strip().startswith('name:'): + policy_name = line.split(':', 1)[1].strip() + + # Look backwards for comments + j = i - 1 + current_desc_block = [] + while j >= 0 and lines[j].strip().startswith('#'): + comment = lines[j].strip().lstrip('#').strip() + current_desc_block.insert(0, comment) + j -= 1 + + if current_desc_block: + description_lines = current_desc_block + break + + if policy_name: + found_any_in_file = True + print(f" - Name: {policy_name}") + if description_lines: + # Heuristic to clean up "Policy: " prefix + if description_lines[0].lower().startswith('policy:'): + description_lines[0] = description_lines[0][len('policy:'):].strip() + + print(f" Description: {description_lines[0]}") + for desc_line in description_lines[1:]: + print(f" {desc_line}") + else: + print(" Description: (No description found)") + + relative_path = policy_file.relative_to(project_root) + print(f" Usage: --policy {relative_path} --policy-name {policy_name}") + + if not found_any_in_file: + print(" (No named policies found in this file)") + + except Exception as e: + print(f" Error parsing {policy_file.name}: {e}") + + return 0 + + +def main_stress_policy(args): + """Main logic for the 'stress-policy' command.""" + if args.list_policies: + return list_policies() + + if not args.policy: + print("Error: --policy is required unless using --list-policies.", file=sys.stderr) + return 1 + + # Handle --show-overrides early, as it doesn't run the test. + if args.show_overrides: + policy = load_policy(args.policy, args.policy_name) + if not policy: + return 1 # load_policy prints its own error + print_policy_overrides(policy) + return 0 + + log_level = logging.DEBUG if args.verbose else logging.INFO + log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' if args.verbose else '%(asctime)s - %(message)s' + date_format = None if args.verbose else '%H:%M:%S' + logging.basicConfig(level=log_level, format=log_format, datefmt=date_format, stream=sys.stdout) + + policy = load_policy(args.policy, args.policy_name) + policy = apply_overrides(policy, args.set) + + policy_name = policy.get('name', args.policy_name or Path(args.policy).stem) + + state_manager = StateManager(policy_name) + + # --- Graceful shutdown handler --- + def shutdown_handler(signum, frame): + if not shutdown_event.is_set(): + logger.info(f"\nSignal {signum} received, shutting down gracefully...") + shutdown_event.set() + + # Save state immediately to prevent loss on interrupt. + logger.info("Attempting to save state before shutdown...") + state_manager.close() + + # Kill running subprocesses to unblock workers + with process_lock: + if running_processes: + logger.info(f"Terminating {len(running_processes)} running subprocess(es)...") + for p in running_processes: + try: + # Kill the entire process group to ensure child processes (like yt-dlp) are terminated. + os.killpg(os.getpgid(p.pid), signal.SIGKILL) + except (ProcessLookupError, PermissionError): + pass # Process already finished or we lack permissions + logger.info("Subprocesses terminated. Waiting for workers to finish. 
Press Ctrl+C again to force exit.") + else: + logger.info("Second signal received, forcing exit.") + # Use os._exit for a hard exit that doesn't run cleanup handlers, + # which can deadlock if locks are held. + os._exit(1) + + signal.signal(signal.SIGINT, shutdown_handler) + signal.signal(signal.SIGTERM, shutdown_handler) + + settings = policy.get('settings', {}) + + # --- Load sources based on mode --- + mode = settings.get('mode', 'full_stack') + sources = [] # This will be a list of URLs or Path objects + if mode in ['full_stack', 'fetch_only']: + urls_file = settings.get('urls_file') + if not urls_file: + logger.error("Policy mode requires 'settings.urls_file'.") + return 1 + try: + with open(urls_file, 'r', encoding='utf-8') as f: + content = f.read() + try: + data = json.loads(content) + if isinstance(data, list) and all(isinstance(item, str) for item in data): + sources = data + logger.info(f"Loaded {len(sources)} URLs/IDs from JSON array in {urls_file}.") + else: + logger.error(f"URL file '{urls_file}' is valid JSON but not an array of strings.") + return 1 + except json.JSONDecodeError: + sources = [line.strip() for line in content.splitlines() if line.strip()] + logger.info(f"Loaded {len(sources)} URLs/IDs from text file {urls_file}.") + except IOError as e: + logger.error(f"Failed to read urls_file {urls_file}: {e}") + return 1 + + # Clean up URLs/IDs which might have extra quotes, commas, or brackets from copy-pasting + cleaned_sources = [] + for source in sources: + cleaned_source = source.strip().rstrip(',').strip().strip('\'"[]').strip() + if cleaned_source: + cleaned_sources.append(cleaned_source) + + if len(cleaned_sources) != len(sources): + logger.info(f"Cleaned URL list, removed {len(sources) - len(cleaned_sources)} empty or invalid entries.") + + sources = cleaned_sources + elif mode == 'download_only': + # If not in continuous mode, load sources once at the start. + # In continuous mode, `sources` is populated at the start of each cycle. + if settings.get('directory_scan_mode') != 'continuous': + info_json_dir = settings.get('info_json_dir') + if not info_json_dir: + logger.error("Policy mode 'download_only' requires 'settings.info_json_dir'.") + return 1 + try: + all_files = sorted(Path(info_json_dir).glob('*.json')) + sample_percent = settings.get('info_json_dir_sample_percent') + if sample_percent and 0 < sample_percent <= 100: + sample_count = int(len(all_files) * (sample_percent / 100.0)) + num_to_sample = min(len(all_files), max(1, sample_count)) + sources = random.sample(all_files, k=num_to_sample) + logger.info(f"Randomly sampled {len(sources)} files ({sample_percent}%) from {info_json_dir}") + else: + sources = all_files + except (IOError, FileNotFoundError) as e: + logger.error(f"Failed to read info_json_dir {info_json_dir}: {e}") + return 1 + + # In continuous download mode, sources are loaded inside the loop, so we skip this check. + if settings.get('directory_scan_mode') != 'continuous' and not sources: + logger.error("No sources (URLs or info.json files) to process. 
Exiting.") + return 1 + + # --- Group sources by profile if in download_only mode with regex --- + profile_tasks = None + task_items = sources # Default to list of sources + profile_extraction_regex = settings.get('profile_extraction_regex') + + if mode == 'download_only' and profile_extraction_regex: + logger.info(f"Grouping info.json files by profile using regex: {profile_extraction_regex}") + profile_tasks = collections.defaultdict(list) + for source_path in sources: + profile_name = get_profile_from_filename(source_path, profile_extraction_regex) + if profile_name: + profile_tasks[profile_name].append(source_path) + else: + # Assign to a default profile if no match + profile_tasks['unmatched_profile'].append(source_path) + + num_profiles = len(profile_tasks) + logger.info(f"Found {num_profiles} unique profiles. Tasks will be processed sequentially per profile.") + # The new "sources" for the purpose of task distribution are the profiles. + task_items = list(profile_tasks.items()) + + # --- Auto-calculate workers if needed --- + exec_control = policy.get('execution_control', {}) + original_workers_setting = exec_control.get('workers') + if original_workers_setting == 'auto': + if mode == 'download_only' and profile_tasks is not None: + num_profiles = len(profile_tasks) + # Use auto_workers_max from policy, with a default of 8. + max_workers = exec_control.get('auto_workers_max', 8) + num_workers = min(num_profiles, max_workers) + exec_control['workers'] = max(1, num_workers) + logger.info(f"Calculated 'auto' workers based on {num_profiles} profiles (max: {max_workers}): {exec_control['workers']}") + else: + target_rate_cfg = exec_control.get('target_rate', {}) + target_reqs = target_rate_cfg.get('requests') + target_mins = target_rate_cfg.get('per_minutes') + if target_reqs and target_mins and sources: + target_rpm = target_reqs / target_mins + num_sources = len(sources) + sleep_cfg = exec_control.get('sleep_between_tasks', {}) + avg_sleep = (sleep_cfg.get('min_seconds', 0) + sleep_cfg.get('max_seconds', 0)) / 2 + assumed_task_duration = 12 # Must match assumption in display_effective_policy + + # Formula: workers = (total_work_seconds) / (total_time_for_work) + # total_time_for_work is derived from the target rate: + # (total_cycle_time) = (60 * num_sources) / target_rpm + # total_time_for_work = total_cycle_time - avg_sleep + work_time_available = (60 * num_sources / target_rpm) - avg_sleep + + if work_time_available <= 0: + # The sleep time alone makes the target rate impossible. + # Set workers to max parallelism as a best-effort. + num_workers = num_sources + logger.warning(f"Target rate of {target_rpm} req/min is likely unachievable due to sleep time of {avg_sleep}s.") + logger.warning(f"Setting workers to max parallelism ({num_workers}) as a best effort.") + else: + total_work_seconds = num_sources * assumed_task_duration + num_workers = total_work_seconds / work_time_available + + calculated_workers = max(1, int(num_workers + 0.99)) # Ceiling + exec_control['workers'] = calculated_workers + logger.info(f"Calculated 'auto' workers based on target rate: {calculated_workers}") + else: + logger.warning("Cannot calculate 'auto' workers: 'target_rate' or sources are not defined. 
Defaulting to 1 worker.") + exec_control['workers'] = 1 + + display_effective_policy( + policy, + policy_name, + sources=sources, + profile_names=list(profile_tasks.keys()) if profile_tasks is not None else None, + original_workers_setting=original_workers_setting + ) + + if args.dry_run: + logger.info("Dry run complete. Exiting.") + return 0 + + start_time = time.time() + + run_until_cfg = exec_control.get('run_until', {}) + duration_seconds = (run_until_cfg.get('minutes') or 0) * 60 + max_cycles = run_until_cfg.get('cycles') or 0 + max_requests = run_until_cfg.get('requests') or 0 + + # --- Main test loop --- + cycles = 0 + try: + def process_task(source, source_index, cycle_num): + """Worker task for one source (URL or file path).""" + try: + if shutdown_event.is_set(): + return [] # Shutdown initiated, do not start new work + + # --- Step 1: Get info.json content --- + info_json_content = None + if mode in ['full_stack', 'fetch_only']: + gen_policy = policy.get('info_json_generation_policy', {}) + cmd_template = gen_policy.get('command_template') + + # --- Profile Generation --- + profile_name = None + profile_mode = settings.get('profile_mode') + pm_policy = settings.get('profile_management') + + if profile_mode == 'per_worker_with_rotation': + if not pm_policy: + logger.error("Profile mode 'per_worker_with_rotation' requires 'settings.profile_management' configuration.") + # Log a failure event and skip + event = {'type': 'fetch', 'path': str(source), 'success': False, 'error_type': 'ConfigError', 'details': 'Missing profile_management section'} + state_manager.log_event(event) + return [] + worker_id = get_worker_id() + profile_name = state_manager.get_or_rotate_worker_profile(worker_id, policy) + elif pm_policy: + # This is the existing dynamic cooldown logic + profile_name = state_manager.get_next_available_profile(policy) + if not profile_name: + logger.warning("No available profiles to run task. Skipping.") + return [] + else: + # This is the legacy logic + profile_prefix = settings.get('profile_prefix') + if profile_prefix: + if profile_mode == 'per_request': + timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f') + profile_name = f"{profile_prefix}_{timestamp}_{source_index}" + elif profile_mode == 'per_worker': + worker_index = get_worker_id() + profile_name = f"{profile_prefix}_{worker_index}" + else: # Default to pool logic + profile_pool = settings.get('profile_pool') + if profile_pool: + profile_name = f"{profile_prefix}_{source_index % profile_pool}" + else: + profile_name = "default" # A final fallback + + # --- Rate Limit Check --- + if not state_manager.check_and_update_rate_limit(profile_name, policy): + return [] # Rate limited, skip this task + + # --- Command Generation --- + gen_cmd = [] + save_dir = settings.get('save_info_json_dir') + save_path = None + + if cmd_template: + # Low-level template mode. The user is responsible for output. + video_id = get_video_id(source) + + # A heuristic to add '--' if the video ID looks like an option. + # We split the template, find the standalone '{url}' placeholder, + # and insert '--' before it. This assumes it's a positional argument. + template_parts = shlex.split(cmd_template) + try: + # Find from the end, in case it's used in an option value earlier. + url_index = len(template_parts) - 1 - template_parts[::-1].index('{url}') + if video_id.startswith('-'): + template_parts.insert(url_index, '--') + except ValueError: + # '{url}' not found as a standalone token, do nothing special. 
+ pass + + # Rejoin and then format the whole string. + gen_cmd_str = ' '.join(template_parts) + gen_cmd_str = gen_cmd_str.format(url=video_id, profile=profile_name) + gen_cmd = shlex.split(gen_cmd_str) + if args.verbose and '--verbose' not in gen_cmd: + gen_cmd.append('--verbose') + else: + # High-level policy mode. Orchestrator builds the command. + script_cmd_str = settings.get('info_json_script') + if not script_cmd_str: + logger.error("High-level policy requires 'settings.info_json_script'.") + return [] + gen_cmd = shlex.split(script_cmd_str) + video_id = get_video_id(source) + + client_to_use, request_params = state_manager.get_client_for_request(profile_name, gen_policy) + + # --- Multi-Cookie File Logic --- + if pm_policy: + cookie_files = pm_policy.get('cookie_files') + if cookie_files and isinstance(cookie_files, list) and len(cookie_files) > 0: + profile_index = -1 + # Extract index from profile name. Matches _ or __ + match = re.search(r'_(\d+)(?:_(\d+))?$', profile_name) + if match: + # For rotation mode, the first group is worker_id. For pool mode, it's the profile index. + profile_index = int(match.group(1)) + + if profile_index != -1: + cookie_file_path = cookie_files[profile_index % len(cookie_files)] + if not request_params: + request_params = {} + request_params['cookies_file_path'] = cookie_file_path + logger.info(f"[{source}] Assigned cookie file '{os.path.basename(cookie_file_path)}' to profile '{profile_name}'") + else: + logger.warning(f"[{source}] Could not determine index for profile '{profile_name}' to assign cookie file.") + + if client_to_use: + gen_cmd.extend(['--client', str(client_to_use)]) + if gen_policy.get('auth_host'): + gen_cmd.extend(['--auth-host', str(gen_policy.get('auth_host'))]) + if gen_policy.get('auth_port'): + gen_cmd.extend(['--auth-port', str(gen_policy.get('auth_port'))]) + if profile_name != "default": + gen_cmd.extend(['--profile', profile_name]) + + # Add --print-proxy so we can track it for stats + if '--print-proxy' not in gen_cmd: + gen_cmd.append('--print-proxy') + + if request_params: + gen_cmd.extend(['--request-params-json', json.dumps(request_params)]) + if gen_policy.get('assigned_proxy_url'): + gen_cmd.extend(['--assigned-proxy-url', str(gen_policy.get('assigned_proxy_url'))]) + if gen_policy.get('proxy_rename'): + gen_cmd.extend(['--proxy-rename', str(gen_policy.get('proxy_rename'))]) + + if args.verbose: + gen_cmd.append('--verbose') + + # If saving is enabled, delegate saving to the client script. + if save_dir: + try: + os.makedirs(save_dir, exist_ok=True) + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + # Note: Using a timestamped filename to avoid race conditions. + filename = f"{timestamp}-{video_id}-{profile_name}.json" + save_path = Path(save_dir) / filename + gen_cmd.extend(['--output', str(save_path)]) + # No longer need to suppress, it's the default. + except IOError as e: + logger.error(f"[{source}] Could not prepare save path in '{save_dir}': {e}") + # Continue without saving + save_path = None + + # If not saving to a file, we need the output on stdout for the download step. + if not save_dir: + gen_cmd.append('--print-info-out') + + # The positional video_id argument must come after all options. + # Use '--' to ensure it's not parsed as an option if it starts with a dash. 
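+ # For example, a hypothetical ID like '-AbCdEfGhIj' would otherwise be read as an
+ # option by the client script, so the command is terminated as '... -- -AbCdEfGhIj'.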
+ if video_id.startswith('-'): + gen_cmd.append('--') + gen_cmd.append(video_id) + + worker_id = get_worker_id() + profile_log_part = f" [Profile: {profile_name}]" if profile_name else "" + logger.info(f"[Worker {worker_id}]{profile_log_part} [{source}] Running info.json command: {' '.join(shlex.quote(s) for s in gen_cmd)}") + retcode, stdout, stderr = run_command(gen_cmd) + info_json_content = stdout + + # --- Extract proxy from stderr and record it for stats --- + proxy_url = None + proxy_match = re.search(r"Proxy used: (.*)", stderr) + if proxy_match: + proxy_url = proxy_match.group(1).strip() + state_manager.record_proxy_usage(proxy_url) + + if retcode == 0: + # If the client script saved the file, stdout will be empty. + # If we need the content for a download step, we must read it back. + if not info_json_content.strip(): + # Check stderr for the success message to confirm save. + saved_path_match = re.search(r"Successfully saved info.json to (.*)", stderr) + if saved_path_match: + output_file_str = saved_path_match.group(1).strip().strip("'\"") + logger.info(f"[{source}] -> {saved_path_match.group(0).strip()}") + + # If this is a full_stack test, we need the content for the download worker. + if mode == 'full_stack': + try: + with open(output_file_str, 'r', encoding='utf-8') as f: + info_json_content = f.read() + except IOError as e: + logger.error(f"Could not read back info.json from '{output_file_str}': {e}") + retcode = -1 # Treat as failure + elif save_path: + # Command was told to save, but didn't confirm. Assume it worked if exit code is 0. + logger.info(f"[{source}] -> Client script exited 0, assuming info.json was saved to '{save_path}'") + if mode == 'full_stack': + try: + with open(save_path, 'r', encoding='utf-8') as f: + info_json_content = f.read() + except IOError as e: + logger.error(f"Could not read back info.json from '{save_path}': {e}") + retcode = -1 + # If stdout is empty and we weren't saving, it's an issue. + elif not save_path and not cmd_template: + logger.error(f"[{source}] info.json generation gave no stdout and was not asked to save to a file.") + retcode = -1 + else: + logger.info(f"[{source}] -> Successfully fetched info.json to memory/stdout.") + + event = {'type': 'fetch', 'path': str(source), 'profile': profile_name} + if proxy_url: + event['proxy_url'] = proxy_url + + if retcode != 0: + error_lines = [line for line in stderr.strip().split('\n') if 'error' in line.lower()] + error_msg = error_lines[-1] if error_lines else stderr.strip().split('\n')[-1] + logger.error(f"[{source}] Failed to generate info.json: {error_msg}") + event.update({'success': False, 'error_type': 'GetInfoJsonFail', 'details': error_msg}) + state_manager.log_event(event) + return [] + + # Check for quality degradation before logging success + s_conditions = policy.get('stop_conditions', {}) + quality_policy = s_conditions.get('on_quality_degradation') + if quality_policy and info_json_content: + try: + info_data = json.loads(info_json_content) + available_formats = {f.get('format_id') for f in info_data.get('formats', [])} + + required_formats = quality_policy.get('trigger_if_missing_formats') + if required_formats: + # Can be a single string, a comma-separated string, or a list of strings. + if isinstance(required_formats, str): + required_formats = [f.strip() for f in required_formats.split(',')] + + missing_formats = [f for f in required_formats if f not in available_formats] + + if missing_formats: + logger.warning(f"[{source}] Quality degradation detected. 
Missing required formats: {', '.join(missing_formats)}.") + event['quality_degradation_trigger'] = True + event['missing_formats'] = missing_formats + except (json.JSONDecodeError, TypeError): + logger.warning(f"[{source}] Could not parse info.json or find formats to check for quality degradation.") + + # Record request for profile cooldown policy if active + if pm_policy: + state_manager.record_profile_request(profile_name) + + state_manager.increment_request_count() + event.update({'success': True, 'details': 'OK'}) + state_manager.log_event(event) + + # Saving is now delegated to the client script when a save_dir is provided. + # The orchestrator no longer saves the file itself. + + elif mode == 'download_only': + # This path is for non-profile-grouped download_only mode. + try: + with open(source, 'r', encoding='utf-8') as f: + info_json_content = f.read() + except (IOError, FileNotFoundError) as e: + logger.error(f"[{get_display_name(source)}] Could not read info.json file: {e}") + return [] + + if mode != 'fetch_only': + return _run_download_logic(source, info_json_content, policy, state_manager, profile_name=profile_name) + + return [] + finally: + # Sleep after the task is completed to space out requests from this worker. + exec_control = policy.get('execution_control', {}) + sleep_cfg = exec_control.get('sleep_between_tasks', {}) + sleep_min = sleep_cfg.get('min_seconds', 0) + + if sleep_min > 0: + sleep_max = sleep_cfg.get('max_seconds') or sleep_min + if sleep_max > sleep_min: + sleep_duration = random.uniform(sleep_min, sleep_max) + else: + sleep_duration = sleep_min + + logger.debug(f"Worker sleeping for {sleep_duration:.2f}s after task for {get_display_name(source)}.") + # Interruptible sleep + sleep_end_time = time.time() + sleep_duration + while time.time() < sleep_end_time: + if shutdown_event.is_set(): + break + time.sleep(0.2) + + while not shutdown_event.is_set(): + if duration_seconds and (time.time() - start_time) > duration_seconds: + logger.info("Reached duration limit. Stopping.") + break + if max_requests > 0 and state_manager.get_request_count() >= max_requests: + logger.info(f"Reached max requests ({max_requests}). Stopping.") + break + + # --- Rescan for sources if in continuous download mode --- + if mode == 'download_only' and settings.get('directory_scan_mode') == 'continuous': + info_json_dir = settings.get('info_json_dir') + try: + all_files_in_dir = Path(info_json_dir).glob('*.json') + processed_files = state_manager.get_processed_files() + + new_files = [f for f in all_files_in_dir if str(f) not in processed_files] + + # Sort by modification time, oldest first, to process in order of creation + new_files.sort(key=os.path.getmtime) + + max_files_per_cycle = settings.get('max_files_per_cycle') + if max_files_per_cycle and len(new_files) > max_files_per_cycle: + sources = new_files[:max_files_per_cycle] + else: + sources = new_files + + if not sources: + sleep_duration = settings.get('sleep_if_no_new_files_seconds', 10) + logger.info(f"No new info.json files found in '{info_json_dir}'. Sleeping for {sleep_duration}s...") + + # Interruptible sleep + sleep_end_time = time.time() + sleep_duration + while time.time() < sleep_end_time: + if shutdown_event.is_set(): + break + time.sleep(0.5) + + if shutdown_event.is_set(): + break + continue # Skip to next iteration of the while loop + + except (IOError, FileNotFoundError) as e: + logger.error(f"Failed to read info_json_dir {info_json_dir}: {e}. 
Retrying in 10s.") + time.sleep(10) + continue + + cycles += 1 + if max_cycles > 0 and cycles > max_cycles: + logger.info(f"Reached max cycles ({max_cycles}). Stopping.") + break + + logger.info(f"--- Cycle #{cycles} (Total Requests: {state_manager.get_request_count()}) ---") + + with concurrent.futures.ThreadPoolExecutor(max_workers=exec_control.get('workers', 1)) as executor: + if mode == 'download_only' and profile_tasks is not None: + # New: submit profile tasks + future_to_source = { + executor.submit(process_profile_task, profile_name, file_list, policy, state_manager, cycles): profile_name + for profile_name, file_list in task_items + } + else: + # Old: submit individual file/url tasks + future_to_source = { + executor.submit(process_task, source, i, cycles): source + for i, source in enumerate(task_items) + } + + should_stop = False + pending_futures = set(future_to_source.keys()) + + while pending_futures and not should_stop: + done, pending_futures = concurrent.futures.wait( + pending_futures, return_when=concurrent.futures.FIRST_COMPLETED + ) + + for future in done: + if shutdown_event.is_set(): + should_stop = True + break + + source = future_to_source[future] + try: + results = future.result() + + # Mark file as processed in continuous download mode + if mode == 'download_only' and settings.get('directory_scan_mode') == 'continuous': + state_manager.mark_file_as_processed(source) + + if settings.get('mark_processed_files'): + try: + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + new_path = source.parent / f"{source.name}.{timestamp}.processed" + source.rename(new_path) + logger.info(f"Marked '{source.name}' as processed by renaming to '{new_path.name}'") + except (IOError, OSError) as e: + logger.error(f"Failed to rename processed file '{source.name}': {e}") + + for result in results: + if not result['success']: + s_conditions = policy.get('stop_conditions', {}) + is_cumulative_403_active = s_conditions.get('on_cumulative_403', {}).get('max_errors') + if s_conditions.get('on_failure') or \ + (s_conditions.get('on_http_403') and not is_cumulative_403_active and result['error_type'] == 'HTTP 403') or \ + (s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'): + logger.info(f"!!! STOP CONDITION MET: Immediate stop on failure '{result['error_type']}' for {get_display_name(source)}. Shutting down all workers. !!!") + should_stop = True + break + except concurrent.futures.CancelledError: + logger.info(f"Task for {get_display_name(source)} was cancelled during shutdown.") + event = { + 'type': 'fetch' if mode != 'download_only' else 'download', + 'path': str(source), + 'success': False, + 'error_type': 'Cancelled', + 'details': 'Task cancelled during shutdown.' + } + state_manager.log_event(event) + except Exception as exc: + logger.error(f'{get_display_name(source)} generated an exception: {exc}') + + if should_stop: + break + + # Check for cumulative error rate stop conditions + s_conditions = policy.get('stop_conditions', {}) + error_rate_policy = s_conditions.get('on_error_rate') + if error_rate_policy and not should_stop: + max_errors = error_rate_policy.get('max_errors') + per_minutes = error_rate_policy.get('per_minutes') + if max_errors and per_minutes: + error_count = state_manager.check_cumulative_error_rate(max_errors, per_minutes) + if error_count > 0: + logger.info(f"!!! STOP CONDITION MET: Error rate exceeded: {error_count} errors in the last {per_minutes} minute(s). Shutting down. 
!!!") + should_stop = True + + cumulative_403_policy = s_conditions.get('on_cumulative_403') + if cumulative_403_policy and not should_stop: + max_errors = cumulative_403_policy.get('max_errors') + per_minutes = cumulative_403_policy.get('per_minutes') + if max_errors and per_minutes: + error_count = state_manager.check_cumulative_error_rate(max_errors, per_minutes, error_type='HTTP 403') + if error_count > 0: + logger.info(f"!!! STOP CONDITION MET: Cumulative 403 error rate exceeded: {error_count} errors in the last {per_minutes} minute(s). Shutting down. !!!") + should_stop = True + + quality_degradation_policy = s_conditions.get('on_quality_degradation') + if quality_degradation_policy and not should_stop: + max_triggers = quality_degradation_policy.get('max_triggers') + per_minutes = quality_degradation_policy.get('per_minutes') + if max_triggers and per_minutes: + trigger_count = state_manager.check_quality_degradation_rate(max_triggers, per_minutes) + if trigger_count > 0: + logger.info(f"!!! STOP CONDITION MET: Quality degradation triggered {trigger_count} times in the last {per_minutes} minute(s). Shutting down. !!!") + should_stop = True + + if should_stop: + break + + # Check for duration limit after each task completes + if duration_seconds and (time.time() - start_time) > duration_seconds: + logger.info("Reached duration limit. Cancelling remaining tasks.") + should_stop = True + + if should_stop and pending_futures: + logger.info(f"Cancelling {len(pending_futures)} outstanding task(s).") + for future in pending_futures: + future.cancel() + + if should_stop: break + + if max_cycles > 0 and cycles >= max_cycles: + break + + logger.info("Cycle complete.") + + except KeyboardInterrupt: + logger.info("\nForceful shutdown requested...") + finally: + state_manager.print_summary(policy) + state_manager.close() + + return 0