Update minio and envoy ports on master; add ytops_client

aperez 2025-11-19 10:37:09 +03:00
parent f151ffee86
commit 0ead029b85
24 changed files with 5868 additions and 552 deletions

View File

@ -105,7 +105,9 @@ RUN mv /usr/local/bin/pip /usr/local/bin/pip.orig && \
"gunicorn==20.1.0" \
"python-ffmpeg==2.0.12" \
"ffprobe3" \
"python-dotenv" && \
"python-dotenv" \
"PyYAML" \
"aria2p" && \
mv /usr/local/bin/pip.orig /usr/local/bin/pip
# --- Install the custom yt_ops_services package ---
@ -117,6 +119,12 @@ COPY --chown=airflow:airflow yt_ops_services ./yt_ops_services/
COPY --chown=airflow:airflow thrift_model ./thrift_model/
COPY --chown=airflow:airflow pangramia ./pangramia/
# Copy the ytops-client tool and its executable
COPY --chown=airflow:airflow ytops_client ./ytops_client/
COPY --chown=airflow:airflow bin/ytops-client /app/bin/ytops-client
RUN chmod +x /app/bin/ytops-client
ENV PATH="/app/bin:${PATH}"
# Install the package in editable mode. This runs setup.py and installs all dependencies
# listed in `install_requires`, making the `yt_ops_services` module available everywhere.
# Bypass the pip root check again.
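
A minimal smoke test for this layer, assuming only that ytops-client is the argparse-based CLI added below (so it answers --help) and sits on PATH inside the image; it is a sketch, not part of the commit:

import shutil
import subprocess
import sys

def check_ytops_client() -> None:
    """Fail fast if the entry point copied above is missing or broken."""
    path = shutil.which("ytops-client")
    if path is None:
        sys.exit("ytops-client not found on PATH; check the COPY/ENV layers")
    # argparse-based CLIs exit 0 on --help, so this is a cheap liveness check.
    subprocess.run([path, "--help"], check=True, capture_output=True, timeout=30)
    print(f"ytops-client resolved at {path}")

if __name__ == "__main__":
    check_ytops_client()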

View File

@ -118,14 +118,14 @@ services:
- "{{ service_role }}"
# --- S3 Logging Parameters ---
- "--s3-endpoint-url"
- "${S3_ENDPOINT_URL}"
- "--s3-access-key-id"
- "${S3_ACCESS_KEY_ID}"
- "--s3-secret-access-key"
- "${S3_SECRET_ACCESS_KEY}"
- "--s3-region-name"
- "${S3_REGION_NAME}"
#- "--s3-endpoint-url"
#- "${S3_ENDPOINT_URL}"
#- "--s3-access-key-id"
#- "${S3_ACCESS_KEY_ID}"
#- "--s3-secret-access-key"
#- "${S3_SECRET_ACCESS_KEY}"
#- "--s3-region-name"
#- "${S3_REGION_NAME}"
{% if service_role is defined and service_role != 'management' %}
# --- Parameters for worker/all-in-one roles ONLY ---
- "--script-dir"

View File

@ -4,11 +4,11 @@ events {
http {
upstream minio_servers {
server minio:9000;
server 172.17.0.1:9001;
}
upstream minio_console_servers {
server minio:9001;
server 172.17.0.1:9002;
}
server {
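
With the upstreams re-pointed at the Docker host gateway, a quick reachability sketch (assuming 172.17.0.1:9001 and :9002 are the MinIO API and console ports exposed on the host) could be run from the proxy container:

import socket

# 172.17.0.1:9001 (API) and :9002 (console) match the upstreams above.
for host, port in [("172.17.0.1", 9001), ("172.17.0.1", 9002)]:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(3)
        try:
            s.connect((host, port))
            print(f"{host}:{port} reachable")
        except OSError as exc:
            print(f"{host}:{port} unreachable: {exc}")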

View File

@ -45,7 +45,7 @@ except ImportError as e:
raise
DEFAULT_MANAGEMENT_SERVICE_IP = Variable.get("MANAGEMENT_SERVICE_HOST", default_var="envoy-thrift-lb")
DEFAULT_MANAGEMENT_SERVICE_PORT = Variable.get("MANAGEMENT_SERVICE_PORT", default_var=9080)
DEFAULT_MANAGEMENT_SERVICE_PORT = Variable.get("MANAGEMENT_SERVICE_PORT", default_var=9980)
DEFAULT_REDIS_CONN_ID = "redis_default"
# Version tracking for debugging

View File

@ -55,9 +55,13 @@ def _get_predefined_url_lists():
'urls.dh128.json',
'urls.rt100.json',
'urls.rt25.json',
'urls.rt250.json',
'urls.rt500.json',
'urls.rt3000.json',
'urls.sky28.json',
'urls.sky3.json',
'urls.tq46.json',
'urls.topnews500.json',
]
return ['None'] + sorted(predefined_files)
@ -256,15 +260,15 @@ def clear_queue_callable(**context):
redis_conn_id = params['redis_conn_id']
queue_system = params.get('queue_system', 'v1_monolithic')
queue_base_names_to_clear = []
if queue_system == 'v1_monolithic':
queue_base_name = params['queue_base_name']
elif queue_system == 'v2_separated_auth':
queue_base_name = 'queue2_auth'
elif queue_system == 'v2_separated_dl':
queue_base_name = 'queue2_dl'
queue_base_names_to_clear.append(params['queue_base_name'])
elif queue_system.startswith('v2_'):
# For v2, clear both auth and dl queues for a complete clear.
queue_base_names_to_clear.extend(['queue2_auth', 'queue2_dl'])
else:
raise ValueError(f"Invalid queue_system: {queue_system}")
logger.info(f"Operating on queue system '{queue_system}' with base name '{queue_base_name}'.")
logger.info(f"Operating on queue system '{queue_system}' with base names: {queue_base_names_to_clear}.")
queues_to_clear_options = params.get('queues_to_clear_options', [])
confirm_clear = params.get('confirm_clear', False)
@ -290,14 +294,15 @@ def clear_queue_callable(**context):
all_suffixes = ['_inbox', '_fail', '_result', '_progress']
keys_to_delete = set()
if '_all' in queues_to_clear_options:
logger.info("'_all' option selected. Clearing all standard queues.")
for suffix in all_suffixes:
keys_to_delete.add(f"{queue_base_name}{suffix}")
else:
for suffix in queues_to_clear_options:
if suffix in all_suffixes:
for queue_base_name in queue_base_names_to_clear:
if '_all' in queues_to_clear_options:
logger.info(f"'_all' option selected. Clearing all standard queues for base '{queue_base_name}'.")
for suffix in all_suffixes:
keys_to_delete.add(f"{queue_base_name}{suffix}")
else:
for suffix in queues_to_clear_options:
if suffix in all_suffixes:
keys_to_delete.add(f"{queue_base_name}{suffix}")
if not keys_to_delete:
logger.warning("No valid queue suffixes were selected. Nothing to delete.")
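
The v2 branch now composes keys as <base><suffix> for both queue bases; a standalone sketch of the same clearing logic, assuming a plain redis-py client instead of the Airflow hook:

import redis  # standard redis-py client, not the Airflow Redis hook

ALL_SUFFIXES = ("_inbox", "_fail", "_result", "_progress")

def clear_v2_queues(client: "redis.Redis", bases=("queue2_auth", "queue2_dl")) -> int:
    """Delete every <base><suffix> key, mirroring the '_all' path above."""
    keys = [f"{base}{suffix}" for base in bases for suffix in ALL_SUFFIXES]
    # DEL silently skips keys that do not exist, so this is safe to repeat.
    return client.delete(*keys)

# clear_v2_queues(redis.Redis(host="redis", port=6379))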

View File

@ -37,7 +37,7 @@ logger = logging.getLogger(__name__)
# Default settings from Airflow Variables or hardcoded fallbacks
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9980)
DEFAULT_ARGS = {
'owner': 'airflow',

View File

@ -75,10 +75,10 @@ DEFAULT_REQUEST_PARAMS_JSON = """{
# Default settings
DEFAULT_QUEUE_NAME = 'video_queue'
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_TOTAL_WORKERS = 3
DEFAULT_TOTAL_WORKERS = 8
DEFAULT_WORKERS_PER_BUNCH = 1
DEFAULT_WORKER_DELAY_S = 5
DEFAULT_BUNCH_DELAY_S = 20
DEFAULT_WORKER_DELAY_S = 1
DEFAULT_BUNCH_DELAY_S = 1
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)
@ -323,7 +323,7 @@ with DAG(
# --- Worker Passthrough Parameters ---
'on_auth_failure': Param(
'retry_with_new_account',
'proceed_loop_under_manual_inspection',
type="string",
enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'proceed_loop_under_manual_inspection'],
title="[Worker Param] On Authentication Failure Policy",
@ -343,38 +343,17 @@ with DAG(
"'proceed_loop': (Default) Mark URL as failed but continue the processing loop with a new URL. "
"'retry_with_new_token': Attempt to get a new token with a new account and retry the download once. If it fails again, proceed loop."
),
'request_params_json': Param(DEFAULT_REQUEST_PARAMS_JSON, type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service.", render_kwargs={"rows": 20, "cols": 120}),
'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."),
'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="[Worker Param] Base name for Redis queues."),
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."),
'clients': Param(
'mweb,web_camoufox,tv',
'tv_simply',
type="string",
enum=[
'mweb,web_camoufox,tv',
'tv_simply',
'mweb',
'web_camoufox',
'tv',
'custom',
'tv,web_safari,mweb,web_camoufox',
'web_safari',
'web',
'web_embedded',
'web_music',
'web_creator',
'web_safari_camoufox',
'web_embedded_camoufox',
'web_music_camoufox',
'web_creator_camoufox',
'mweb_camoufox',
'android',
'android_music',
'android_creator',
'android_vr',
'ios',
'ios_music',
'ios_creator',
'tv_simply',
'tv_embedded',
],
title="[Worker Param] Clients",
description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_embedded. See DAG documentation for details."
@ -402,27 +381,24 @@ with DAG(
type="string",
enum=['best_audio', 'formats_0', 'formats_2', 'formats_3', 'custom'],
title="[Worker Param] Download Format Preset",
description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformats_0: 18,140\nformats_2: 18,140,299/298/137/136/135/134/133\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318"
description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformats_0: 18,140\nformats_2: 18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318"
),
'download_format_custom': Param(
'18,140,299/298/137/136/135/134/133',
'18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy',
type="string",
title="[Worker Param] Custom Download Format",
description="Custom yt-dlp format string. Used when preset is 'custom'. E.g., 'ba[ext=m4a]/bestaudio/best'."
),
'downloader': Param(
'default',
'cli',
type="string",
enum=['default', 'aria2c'],
title="[Worker Param] Downloader",
description="Choose the downloader for yt-dlp."
),
'downloader_args_aria2c': Param(
'aria2c:-x 4 -k 2M --max-download-limit=3M',
type="string",
title="[Worker Param] Aria2c Downloader Arguments",
description="Arguments to pass to yt-dlp's --downloader-args. Used when downloader is 'aria2c'."
enum=['py', 'aria-rpc', 'cli'],
title="[Worker Param] Download Tool",
description="Choose the download tool to use: 'py' (native python, recommended), 'aria-rpc' (send to aria2c daemon), 'cli' (legacy yt-dlp wrapper)."
),
'aria_host': Param('172.17.0.1', type="string", title="[Worker Param] Aria2c Host", description="For 'aria-rpc' downloader: Host of the aria2c RPC server. Can be set via Airflow Variable 'YTDLP_ARIA_HOST'."),
'aria_port': Param(6800, type="integer", title="[Worker Param] Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server. Can be set via Airflow Variable 'YTDLP_ARIA_PORT'."),
'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="[Worker Param] Aria2c Secret", description="For 'aria-rpc' downloader: Secret token. Can be set via Airflow Variable 'YTDLP_ARIA_SECRET'."),
'yt_dlp_extra_args': Param(
'--restrict-filenames',
type=["string", "null"],
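
The new 'aria-rpc' option hands transfers to an external aria2c daemon; since aria2p is now baked into the image, a minimal sketch of how aria_host/aria_port/aria_secret could drive that daemon directly (illustrative only; the real wiring lives inside ytops-client):

import aria2p  # installed into the image by this commit

def submit_to_aria(url: str, host: str = "172.17.0.1", port: int = 6800, secret: str = "") -> str:
    """Hypothetical helper: hand a URL to an aria2c daemon over JSON-RPC."""
    # aria2p expects the host with an http:// scheme.
    api = aria2p.API(aria2p.Client(host=f"http://{host}", port=port, secret=secret))
    download = api.add_uris([url])
    return download.gid  # aria2 GID, usable for later status polling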

View File

@ -290,7 +290,10 @@ def get_url_and_assign_account(**context):
@task
def get_token(initial_data: dict, **context):
"""Makes a single attempt to get a token from the Thrift service."""
"""Makes a single attempt to get a token by calling the ytops-client get-info tool."""
import subprocess
import shlex
ti = context['task_instance']
params = context['params']
@ -298,129 +301,85 @@ def get_token(initial_data: dict, **context):
url = initial_data['url_to_process']
info_json_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')
host, port, timeout = params['service_ip'], int(params['service_port']), int(params.get('timeout', DEFAULT_TIMEOUT))
host, port = params['service_ip'], int(params['service_port'])
machine_id = params.get('machine_id') or socket.gethostname()
clients = params.get('clients')
request_params_json = params.get('request_params_json', '{}')
assigned_proxy_url = params.get('assigned_proxy_url')
# Pretty-print the request parameters for debugging
try:
pretty_request_params = json.dumps(json.loads(request_params_json), indent=2)
logger.info(f"\n--- Request Parameters ---\n{pretty_request_params}\n--- End of Request Parameters ---")
except (json.JSONDecodeError, TypeError):
logger.warning("Could not parse request_params_json. Using raw content.")
logger.info(f"\n--- Raw Request Parameters ---\n{request_params_json}\n--- End of Raw Request Parameters ---")
video_id = _extract_video_id(url)
os.makedirs(info_json_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
info_json_path = os.path.join(info_json_dir, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json")
# Construct Airflow log context to pass to the service
try:
from airflow.configuration import conf
remote_base = conf.get('logging', 'remote_base_log_folder')
log_path = (
f"{remote_base}/dag_id={ti.dag_id}/run_id={ti.run_id}/"
f"task_id={ti.task_id}/attempt={ti.try_number}.log"
)
airflow_log_context = AirflowLogContext(
logS3Path=log_path,
dagId=ti.dag_id,
runId=ti.run_id,
taskId=ti.task_id,
tryNumber=ti.try_number,
workerHostname=socket.gethostname(),
queue=ti.queue
)
logger.info(f"Constructed Airflow log context for yt-ops service: {airflow_log_context}")
except Exception as e:
logger.warning(f"Could not construct full Airflow log context: {e}. Creating a basic one.")
airflow_log_context = AirflowLogContext(
dagId=ti.dag_id,
runId=ti.run_id,
taskId=ti.task_id,
tryNumber=ti.try_number,
workerHostname=socket.gethostname(),
queue=ti.queue
)
cmd = [
'ytops-client', 'get-info',
'--host', host,
'--port', str(port),
'--profile', account_id,
'--output', info_json_path,
'--print-proxy',
'--verbose',
'--log-return',
]
if clients:
cmd.extend(['--client', clients])
if machine_id:
cmd.extend(['--machine-id', machine_id])
if request_params_json and request_params_json != '{}':
cmd.extend(['--request-params-json', request_params_json])
if assigned_proxy_url:
cmd.extend(['--assigned-proxy-url', assigned_proxy_url])
cmd.append(url)
logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' (Clients: {clients}) ---")
client, transport = None, None
try:
client, transport = _get_thrift_client(host, port, timeout)
token_data = client.getOrRefreshToken(
accountId=account_id,
updateType=TokenUpdateMode.AUTO,
url=url,
clients=clients,
machineId=machine_id,
airflowLogContext=airflow_log_context,
requestParamsJson=request_params_json,
assignedProxyUrl=assigned_proxy_url
)
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
logger.info(f"Executing command: {copy_paste_cmd}")
# Log a compact summary of the Thrift response, omitting large/detailed fields.
summary_token_data = copy(token_data)
if hasattr(summary_token_data, 'infoJson') and summary_token_data.infoJson:
summary_token_data.infoJson = f"... ({len(summary_token_data.infoJson)} bytes) ..."
if hasattr(summary_token_data, 'cookiesBlob') and summary_token_data.cookiesBlob:
summary_token_data.cookiesBlob = f"... ({len(summary_token_data.cookiesBlob)} bytes) ..."
# These will be logged separately below.
if hasattr(summary_token_data, 'requestSummary'):
summary_token_data.requestSummary = "..."
if hasattr(summary_token_data, 'communicationLogs'):
summary_token_data.communicationLogs = "..."
logger.info(f"Thrift service response summary: {summary_token_data}")
process = subprocess.run(cmd, capture_output=True, text=True, timeout=int(params.get('timeout', DEFAULT_TIMEOUT)))
request_summary = getattr(token_data, 'requestSummary', None)
if request_summary:
# Prepending a newline for better separation in logs.
logger.info(f"\n--- Request Summary ---\n{request_summary}")
if process.stdout:
logger.info(f"ytops-client STDOUT:\n{process.stdout}")
if process.stderr:
logger.info(f"ytops-client STDERR:\n{process.stderr}")
communication_logs = getattr(token_data, 'communicationLogs', None)
if communication_logs:
logger.info("--- Communication Logs from Token Service ---")
logger.info(communication_logs)
logger.info("--- End of Communication Logs ---")
info_json = getattr(token_data, 'infoJson', None)
if not (info_json and json.loads(info_json)):
raise AirflowException("Service returned success but info.json was empty or invalid.")
video_id = _extract_video_id(url)
os.makedirs(info_json_dir, exist_ok=True)
# Use a readable timestamp for a unique filename on each attempt.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
info_json_path = os.path.join(info_json_dir, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json")
with open(info_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
ytdlp_command = getattr(token_data, 'ytdlpCommand', None)
if ytdlp_command:
logger.info(f"--- YTDLP Command from Token Service ---\n{ytdlp_command}\n--- End of YTDLP Command ---")
return {
'info_json_path': info_json_path,
'socks_proxy': getattr(token_data, proxy_attr) if proxy_attr else None,
'ytdlp_command': ytdlp_command,
'successful_account_id': account_id,
'original_url': url, # Include original URL for fallback
}
except (PBServiceException, PBUserException, TTransportException) as e:
error_context = getattr(e, 'context', None)
if isinstance(error_context, str):
try: error_context = json.loads(error_context.replace("'", "\""))
except: pass
if process.returncode != 0:
error_message = "ytops-client failed. See logs for details."
for line in reversed(process.stderr.strip().split('\n')):
if 'ERROR' in line or 'Thrift error' in line or 'Connection to server failed' in line:
error_message = line.strip()
break
error_code = 'GET_INFO_CLIENT_FAIL'
if "BOT_DETECTED" in process.stderr:
error_code = "BOT_DETECTED"
elif "BOT_DETECTION_SIGN_IN_REQUIRED" in process.stderr:
error_code = "BOT_DETECTION_SIGN_IN_REQUIRED"
elif "Connection to server failed" in process.stderr:
error_code = "TRANSPORT_ERROR"
error_details = {
'error_message': getattr(e, 'message', str(e)),
'error_code': getattr(e, 'errorCode', 'TRANSPORT_ERROR'),
'proxy_url': error_context.get('proxy_url') if isinstance(error_context, dict) else None
'error_message': error_message,
'error_code': error_code,
'proxy_url': None
}
logger.error(f"Thrift call failed for account '{account_id}'. Details: {error_details}")
ti.xcom_push(key='error_details', value=error_details)
raise AirflowException(f"Thrift call failed: {error_details['error_message']}")
finally:
if transport and transport.isOpen():
transport.close()
raise AirflowException(f"ytops-client get-info failed: {error_message}")
proxy = None
proxy_match = re.search(r"Proxy used: (.*)", process.stderr)
if proxy_match:
proxy = proxy_match.group(1).strip()
return {
'info_json_path': info_json_path,
'socks_proxy': proxy,
'ytdlp_command': None,
'successful_account_id': account_id,
'original_url': url,
}
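
The stderr handling above is plain string matching; factored out purely as an illustration (no such helper exists in the DAG), the classification and proxy extraction behave like this:

import re
from typing import Optional, Tuple

def parse_get_info_stderr(stderr: str) -> Tuple[str, Optional[str]]:
    """Illustrative mirror of the stderr checks in get_token above."""
    code = "GET_INFO_CLIENT_FAIL"
    if "BOT_DETECTED" in stderr:
        code = "BOT_DETECTED"
    elif "BOT_DETECTION_SIGN_IN_REQUIRED" in stderr:
        code = "BOT_DETECTION_SIGN_IN_REQUIRED"
    elif "Connection to server failed" in stderr:
        code = "TRANSPORT_ERROR"
    proxy_match = re.search(r"Proxy used: (.*)", stderr)
    proxy = proxy_match.group(1).strip() if proxy_match else None
    return code, proxy

# parse_get_info_stderr("INFO Proxy used: socks5://10.0.0.1:1080")
# -> ("GET_INFO_CLIENT_FAIL", "socks5://10.0.0.1:1080")
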
@task.branch
def handle_bannable_error_branch(task_id_to_check: str, **context):
@ -706,7 +665,7 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
elif format_preset == 'formats_0':
download_format = '18,140'
elif format_preset == 'formats_2':
download_format = '18,140,299/298/137/136/135/134/133'
download_format = '18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy'
elif format_preset == 'formats_3':
download_format = '18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318'
else:
@ -720,112 +679,102 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
raise AirflowException(f"Error: info.json path is missing or file does not exist ({info_json_path}).")
def run_yt_dlp_command(format_selector: str):
"""Constructs and runs a yt-dlp command, returning a list of final filenames."""
cmd = [
'yt-dlp', '--verbose', '--print-traffic', '--load-info-json', info_json_path,
'-f', format_selector, '-o', full_output_path,
'--print', 'filename', '--continue', '--no-progress', '--no-simulate',
'--no-write-info-json', '--ignore-errors', '--no-playlist',
]
if params.get('fragment_retries'):
cmd.extend(['--fragment-retries', str(params['fragment_retries'])])
if params.get('limit_rate'):
cmd.extend(['--limit-rate', params['limit_rate']])
if params.get('socket_timeout'):
cmd.extend(['--socket-timeout', str(params['socket_timeout'])])
if params.get('min_sleep_interval'):
cmd.extend(['--min-sleep-interval', str(params['min_sleep_interval'])])
if params.get('max_sleep_interval'):
cmd.extend(['--max-sleep-interval', str(params['max_sleep_interval'])])
if params.get('yt_dlp_test_mode'):
cmd.append('--test')
"""Constructs and runs a yt-ops-client download command, returning a list of final filenames."""
downloader = params.get('downloader', 'py')
cmd = ['ytops-client', 'download', downloader, '--load-info-json', info_json_path, '-f', format_selector]
downloader = params.get('downloader', 'default')
if proxy and not (downloader == 'aria2c' and proxy.startswith('socks5://')):
if proxy:
cmd.extend(['--proxy', proxy])
gost_process = None
try:
if downloader == 'aria2c':
cmd.extend(['--downloader', 'aria2c'])
downloader_args = params.get('downloader_args_aria2c')
if proxy and proxy.startswith('socks5://'):
import socket
from contextlib import closing
def find_free_port():
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
return s.getsockname()[1]
local_port = find_free_port()
http_proxy = f"http://127.0.0.1:{local_port}"
logger.info(f"Starting gost for format '{format_selector}' to forward {proxy} to {http_proxy}")
gost_cmd = ['gost', '-L', f'http://127.0.0.1:{local_port}', '-F', proxy]
gost_process = subprocess.Popen(gost_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
time.sleep(1)
if gost_process.poll() is not None:
stdout, stderr = gost_process.communicate()
logger.error(f"gost failed to start. Exit: {gost_process.returncode}. Stdout: {stdout.decode()}. Stderr: {stderr.decode()}")
raise AirflowException("gost proxy tunnel failed to start.")
user_args = downloader_args[len('aria2c:'):] if downloader_args and downloader_args.startswith('aria2c:') else (downloader_args or "")
final_args_str = f'aria2c:{user_args.strip()} --http-proxy={http_proxy}'
cmd.extend(['--downloader-args', final_args_str])
elif downloader_args:
cmd.extend(['--downloader-args', downloader_args])
if downloader == 'py':
cmd.extend(['--output-dir', download_dir])
# The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args
py_extra_args = []
if params.get('fragment_retries'):
py_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])])
if params.get('limit_rate'):
py_extra_args.extend(['--limit-rate', params['limit_rate']])
if params.get('socket_timeout'):
py_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])])
if params.get('min_sleep_interval'):
py_extra_args.extend(['--sleep-interval', str(params['min_sleep_interval'])])
if params.get('max_sleep_interval'):
py_extra_args.extend(['--max-sleep-interval', str(params['max_sleep_interval'])])
if params.get('yt_dlp_test_mode'):
py_extra_args.append('--test')
extra_args = params.get('yt_dlp_extra_args')
if extra_args:
cmd.extend(shlex.split(extra_args))
if original_url:
cmd.append(original_url)
existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '')
final_extra_args = existing_extra + py_extra_args
if final_extra_args:
cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)])
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
logger.info(f"Executing yt-dlp command for format '{format_selector}': {copy_paste_cmd}")
process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
if process.stdout:
logger.info(f"yt-dlp STDOUT for format '{format_selector}':\n{process.stdout}")
if process.stderr:
# yt-dlp often prints progress and informational messages to stderr
logger.info(f"yt-dlp STDERR for format '{format_selector}':\n{process.stderr}")
elif downloader == 'aria-rpc':
cmd.extend([
'--aria-host', params.get('aria_host', '172.17.0.1'),
'--aria-port', str(params.get('aria_port', 6800)),
'--aria-secret', params.get('aria_secret'),
'--wait', '--auto-merge-fragments',
'--fragments-dir', download_dir,
'--output-dir', download_dir,
])
if params.get('yt_dlp_cleanup_mode'):
cmd.append('--cleanup')
if process.returncode != 0:
logger.error(f"yt-dlp failed for format '{format_selector}' with exit code {process.returncode}")
# STDOUT and STDERR are already logged above.
raise AirflowException(f"yt-dlp command failed for format '{format_selector}'. {process.stderr}")
elif downloader == 'cli':
cmd.extend(['--output-dir', download_dir])
# The 'cli' tool is the old yt-dlp wrapper, so it takes similar arguments.
cli_extra_args = []
if params.get('fragment_retries'):
cli_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])])
if params.get('limit_rate'):
cli_extra_args.extend(['--limit-rate', params['limit_rate']])
if params.get('socket_timeout'):
cli_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])])
if params.get('min_sleep_interval'):
cli_extra_args.extend(['--sleep-interval', str(params['min_sleep_interval'])])
if params.get('max_sleep_interval'):
cli_extra_args.extend(['--max-sleep-interval', str(params['max_sleep_interval'])])
if params.get('yt_dlp_test_mode'):
cli_extra_args.append('--test')
# In test mode, files are not created, so we only check that yt-dlp returned filenames.
# Otherwise, we verify that the files actually exist on disk.
output_files = [f for f in process.stdout.strip().split('\n') if f]
if not params.get('yt_dlp_test_mode'):
output_files = [f for f in output_files if os.path.exists(f)]
existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '')
final_extra_args = existing_extra + cli_extra_args
if final_extra_args:
cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)])
if not output_files:
log_msg = (f"Test run for format '{format_selector}' did not produce any filenames."
if params.get('yt_dlp_test_mode') else
f"Download for format '{format_selector}' finished but no output files exist.")
exc_msg = (f"Test run for format '{format_selector}' did not produce any filenames."
if params.get('yt_dlp_test_mode') else
f"Download for format '{format_selector}' did not produce a file.")
logger.error(log_msg)
logger.error(f"Full STDOUT:\n{process.stdout}")
logger.error(f"Full STDERR:\n{process.stderr}")
raise AirflowException(exc_msg)
log_prefix = "SUCCESS (Test Mode):" if params.get('yt_dlp_test_mode') else "SUCCESS:"
logger.info(f"{log_prefix} Command for format '{format_selector}' complete. Files: {output_files}")
return output_files
finally:
if gost_process:
logger.info(f"Terminating gost process (PID: {gost_process.pid}) for format '{format_selector}'.")
gost_process.terminate()
try:
gost_process.wait(timeout=5)
except subprocess.TimeoutExpired:
gost_process.kill()
gost_process.wait()
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
logger.info(f"Executing download command for format '{format_selector}': {copy_paste_cmd}")
process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
if process.stdout:
logger.info(f"Download tool STDOUT for format '{format_selector}':\n{process.stdout}")
if process.stderr:
logger.info(f"Download tool STDERR for format '{format_selector}':\n{process.stderr}")
if process.returncode != 0:
logger.error(f"Download tool failed for format '{format_selector}' with exit code {process.returncode}")
raise AirflowException(f"Download command failed for format '{format_selector}'. See logs for details.")
output_files = []
for line in process.stdout.strip().split('\n'):
# For aria-rpc, parse "Download and merge successful: <path>" or "Download successful: <path>"
match = re.search(r'successful: (.+)', line)
if match:
filepath = match.group(1).strip()
if os.path.exists(filepath):
output_files.append(filepath)
else:
logger.warning(f"File path from aria-rpc output does not exist locally: '{filepath}'")
# For py/cli, it's just the path
elif os.path.exists(line.strip()):
output_files.append(line.strip())
if not params.get('yt_dlp_test_mode') and not output_files:
raise AirflowException(f"Download for format '{format_selector}' finished but no output files were found or exist.")
log_prefix = "SUCCESS (Test Mode):" if params.get('yt_dlp_test_mode') else "SUCCESS:"
logger.info(f"{log_prefix} Command for format '{format_selector}' complete. Files: {output_files}")
return output_files
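
Putting the 'py' branch together, the command this task ultimately shells out to looks roughly like the sketch below; the values are hypothetical and only flags that appear above are assumed:

import shlex

# Hypothetical example values; the flags match the 'py' branch above.
info_json_path = "/opt/airflow/downloadfiles/20251119-abc/info_abc_acct_20251119.json"
download_dir = "/opt/airflow/downloadfiles/20251119-abc"
proxy = "socks5://10.0.0.1:1080"
extra_ytdlp_args = ["--restrict-filenames", "--limit-rate", "3M", "--test"]

cmd = [
    "ytops-client", "download", "py",
    "--load-info-json", info_json_path,
    "-f", "18-dashy,140-dashy",
    "--proxy", proxy,
    "--output-dir", download_dir,
    "--extra-ytdlp-args", shlex.join(extra_ytdlp_args),
]
print(" ".join(shlex.quote(arg) for arg in cmd))
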
def run_ffmpeg_probe(filename):
"""Probes a file with ffmpeg to check for corruption."""
@ -1512,7 +1461,7 @@ with DAG(
'prepend_client_to_account': Param(True, type="boolean", title="[Worker Param] Prepend Client to Account", description="If True, prepends client and timestamp to account names in prefix mode."),
'machine_id': Param(None, type=["string", "null"]),
'assigned_proxy_url': Param(None, type=["string", "null"], title="[Worker Param] Assigned Proxy URL", description="A specific proxy URL to use for the request, overriding the server's proxy pool logic."),
'clients': Param('mweb,web_camoufox,tv', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"),
'clients': Param('tv_simply', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"),
'timeout': Param(DEFAULT_TIMEOUT, type="integer"),
'output_path_template': Param("%(title)s [%(id)s].f%(format_id)s.%(ext)s", type="string", title="[Worker Param] Output Path Template", description="Output filename template for yt-dlp. It is highly recommended to include `%(format_id)s` to prevent filename collisions when downloading multiple formats."),
'on_auth_failure': Param(
@ -1542,11 +1491,11 @@ with DAG(
'min_sleep_interval': Param(5, type="integer", title="[Worker Param] Min Sleep Interval", description="Minimum time to sleep between downloads (seconds)."),
'max_sleep_interval': Param(10, type="integer", title="[Worker Param] Max Sleep Interval", description="Maximum time to sleep between downloads (seconds)."),
'download_format_preset': Param(
'custom',
'formats_2',
type="string",
enum=['best_audio', 'formats_0', 'formats_2', 'formats_3', 'custom'],
title="Download Format Preset",
description="Select a predefined format string or choose 'custom'. To download multiple formats, this should be a comma-separated list of format IDs (e.g., '137,140').\nformats_0: 18,140\nformats_2: 18,140,299/298/137/136/135/134/133\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318"
description="Select a predefined format string or choose 'custom'. To download multiple formats, this should be a comma-separated list of format IDs (e.g., '137,140').\nformats_0: 18,140\nformats_2: 18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318"
),
'download_format_custom': Param(
'18,140,299/298/137/136/135/134/133',
@ -1555,18 +1504,15 @@ with DAG(
description="Custom yt-dlp format string. Used when preset is 'custom'. To download multiple formats, provide a comma-separated list of format IDs (e.g., '137,140')."
),
'downloader': Param(
'default',
'cli',
type="string",
enum=['default', 'aria2c'],
title="Downloader",
description="Choose the downloader for yt-dlp."
),
'downloader_args_aria2c': Param(
'aria2c:-x 4 -k 2M --max-download-limit=3M',
type="string",
title="Aria2c Downloader Arguments",
description="Arguments to pass to yt-dlp's --downloader-args. Used when downloader is 'aria2c'."
enum=['py', 'aria-rpc', 'cli'],
title="Download Tool",
description="Choose the download tool to use: 'py' (native python, recommended), 'aria-rpc' (send to aria2c daemon), 'cli' (legacy yt-dlp wrapper)."
),
'aria_host': Param('172.17.0.1', type="string", title="Aria2c Host", description="For 'aria-rpc' downloader: Host of the aria2c RPC server."),
'aria_port': Param(6800, type="integer", title="Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server."),
'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="Aria2c Secret", description="For 'aria-rpc' downloader: Secret token."),
'yt_dlp_extra_args': Param(
'',
type=["string", "null"],

View File

@ -72,10 +72,10 @@ DEFAULT_REQUEST_PARAMS_JSON = """{
# Default settings
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_TOTAL_WORKERS = 3
DEFAULT_TOTAL_WORKERS = 8
DEFAULT_WORKERS_PER_BUNCH = 1
DEFAULT_WORKER_DELAY_S = 5
DEFAULT_BUNCH_DELAY_S = 20
DEFAULT_WORKER_DELAY_S = 1
DEFAULT_BUNCH_DELAY_S = 1
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)
@ -283,7 +283,7 @@ with DAG(
# --- Worker Passthrough Parameters ---
'on_bannable_failure': Param(
'stop_loop_on_auth_proceed_on_download_error',
'proceed_loop_under_manual_inspection',
type="string",
enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'retry_and_ban_account_only', 'retry_on_connection_error', 'proceed_loop_under_manual_inspection', 'stop_loop_on_auth_proceed_on_download_error'],
title="[Worker Param] On Bannable Failure Policy",
@ -294,37 +294,16 @@ with DAG(
"'proceed_loop_under_manual_inspection': **BEWARE: MANUAL SUPERVISION REQUIRED.** Marks the URL as failed but continues the processing loop. Use this only when you can manually intervene by pausing the dispatcher DAG or creating a lock file (`/opt/airflow/inputfiles/AIRFLOW.PREVENT_URL_PULL.lockfile`) to prevent a runaway failure loop."
"'stop_loop_on_auth_proceed_on_download_error': **(Default)** Stops the loop on an authentication/token error (like 'stop_loop'), but continues the loop on a download/probe error (like 'proceed...')."
),
'request_params_json': Param(DEFAULT_REQUEST_PARAMS_JSON, type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service.", render_kwargs={"rows": 20, "cols": 120}),
'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."),
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."),
'clients': Param(
'mweb,web_camoufox,tv',
'tv_simply',
type="string",
enum=[
'mweb,web_camoufox,tv',
'tv_simply',
'mweb',
'web_camoufox',
'tv',
'custom',
'tv,web_safari,mweb,web_camoufox',
'web_safari',
'web',
'web_embedded',
'web_music',
'web_creator',
'web_safari_camoufox',
'web_embedded_camoufox',
'web_music_camoufox',
'web_creator_camoufox',
'mweb_camoufox',
'android',
'android_music',
'android_creator',
'android_vr',
'ios',
'ios_music',
'ios_creator',
'tv_simply',
'tv_embedded',
],
title="[Worker Param] Clients",
description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_embedded. See DAG documentation for details."

View File

@ -37,10 +37,10 @@ logger = logging.getLogger(__name__)
# Default settings
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_TOTAL_WORKERS = 3
DEFAULT_TOTAL_WORKERS = 8
DEFAULT_WORKERS_PER_BUNCH = 1
DEFAULT_WORKER_DELAY_S = 5
DEFAULT_BUNCH_DELAY_S = 20
DEFAULT_WORKER_DELAY_S = 1
DEFAULT_BUNCH_DELAY_S = 1
# --- Helper Functions ---
@ -260,27 +260,24 @@ with DAG(
type="string",
enum=['best_audio', 'formats_0', 'formats_2', 'formats_3', 'custom'],
title="[Worker Param] Download Format Preset",
description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformats_0: 18,140\nformats_2: 18,140,299/298/137/136/135/134/133\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318"
description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformats_0: 18,140\nformats_2: 18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318"
),
'download_format_custom': Param(
'18,140,299/298/137/136/135/134/133',
'18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy',
type="string",
title="[Worker Param] Custom Download Format",
description="Custom yt-dlp format string. Used when preset is 'custom'. E.g., 'ba[ext=m4a]/bestaudio/best'."
),
'downloader': Param(
'default',
'cli',
type="string",
enum=['default', 'aria2c'],
title="[Worker Param] Downloader",
description="Choose the downloader for yt-dlp."
),
'downloader_args_aria2c': Param(
'aria2c:-x 4 -k 2M --max-download-limit=3M',
type="string",
title="[Worker Param] Aria2c Downloader Arguments",
description="Arguments to pass to yt-dlp's --downloader-args. Used when downloader is 'aria2c'."
enum=['py', 'aria-rpc', 'cli'],
title="[Worker Param] Download Tool",
description="Choose the download tool to use: 'py' (native python, recommended), 'aria-rpc' (send to aria2c daemon), 'cli' (legacy yt-dlp wrapper)."
),
'aria_host': Param('172.17.0.1', type="string", title="[Worker Param] Aria2c Host", description="For 'aria-rpc' downloader: Host of the aria2c RPC server. Can be set via Airflow Variable 'YTDLP_ARIA_HOST'."),
'aria_port': Param(6800, type="integer", title="[Worker Param] Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server. Can be set via Airflow Variable 'YTDLP_ARIA_PORT'."),
'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="[Worker Param] Aria2c Secret", description="For 'aria-rpc' downloader: Secret token. Can be set via Airflow Variable 'YTDLP_ARIA_SECRET'."),
'yt_dlp_extra_args': Param(
'--restrict-filenames',
type=["string", "null"],

View File

@ -380,7 +380,10 @@ def get_url_and_assign_account(**context):
@task
def get_token(initial_data: dict, **context):
"""Makes a single attempt to get a token from the Thrift service."""
"""Makes a single attempt to get a token by calling the ytops-client get-info tool."""
import subprocess
import shlex
ti = context['task_instance']
params = context['params']
@ -388,131 +391,89 @@ def get_token(initial_data: dict, **context):
url = initial_data['url_to_process']
info_json_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')
host, port, timeout = params['service_ip'], int(params['service_port']), int(params.get('timeout', DEFAULT_TIMEOUT))
host, port = params['service_ip'], int(params['service_port'])
machine_id = params.get('machine_id') or socket.gethostname()
clients = params.get('clients')
request_params_json = params.get('request_params_json', '{}')
assigned_proxy_url = params.get('assigned_proxy_url')
# Pretty-print the request parameters for debugging
try:
pretty_request_params = json.dumps(json.loads(request_params_json), indent=2)
logger.info(f"\n--- Request Parameters ---\n{pretty_request_params}\n--- End of Request Parameters ---")
except (json.JSONDecodeError, TypeError):
logger.warning("Could not parse request_params_json. Using raw content.")
logger.info(f"\n--- Raw Request Parameters ---\n{request_params_json}\n--- End of Raw Request Parameters ---")
video_id = _extract_video_id(url)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
job_dir_name = f"{timestamp}-{video_id or 'unknown'}"
job_dir_path = os.path.join(info_json_dir, job_dir_name)
os.makedirs(job_dir_path, exist_ok=True)
info_json_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json"
info_json_path = os.path.join(job_dir_path, info_json_filename)
# Construct Airflow log context to pass to the service
try:
from airflow.configuration import conf
remote_base = conf.get('logging', 'remote_base_log_folder')
log_path = (
f"{remote_base}/dag_id={ti.dag_id}/run_id={ti.run_id}/"
f"task_id={ti.task_id}/attempt={ti.try_number}.log"
)
airflow_log_context = AirflowLogContext(
logS3Path=log_path,
dagId=ti.dag_id,
runId=ti.run_id,
taskId=ti.task_id,
tryNumber=ti.try_number,
workerHostname=socket.gethostname(),
queue=ti.queue
)
logger.info(f"Constructed Airflow log context for yt-ops service: {airflow_log_context}")
except Exception as e:
logger.warning(f"Could not construct full Airflow log context: {e}. Creating a basic one.")
airflow_log_context = AirflowLogContext(
dagId=ti.dag_id,
runId=ti.run_id,
taskId=ti.task_id,
tryNumber=ti.try_number,
workerHostname=socket.gethostname(),
queue=ti.queue
)
cmd = [
'ytops-client', 'get-info',
'--host', host,
'--port', str(port),
'--profile', account_id,
'--output', info_json_path,
'--print-proxy',
'--verbose',
'--log-return',
]
if clients:
cmd.extend(['--client', clients])
if machine_id:
cmd.extend(['--machine-id', machine_id])
if request_params_json and request_params_json != '{}':
cmd.extend(['--request-params-json', request_params_json])
if assigned_proxy_url:
cmd.extend(['--assigned-proxy-url', assigned_proxy_url])
cmd.append(url)
logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' (Clients: {clients}) ---")
client, transport = None, None
try:
client, transport = _get_thrift_client(host, port, timeout)
token_data = client.getOrRefreshToken(
accountId=account_id,
updateType=TokenUpdateMode.AUTO,
url=url,
clients=clients,
machineId=machine_id,
airflowLogContext=airflow_log_context,
requestParamsJson=request_params_json,
assignedProxyUrl=assigned_proxy_url
)
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
logger.info(f"Executing command: {copy_paste_cmd}")
# Log a compact summary of the Thrift response, omitting large/detailed fields.
summary_token_data = copy(token_data)
if hasattr(summary_token_data, 'infoJson') and summary_token_data.infoJson:
summary_token_data.infoJson = f"... ({len(summary_token_data.infoJson)} bytes) ..."
if hasattr(summary_token_data, 'cookiesBlob') and summary_token_data.cookiesBlob:
summary_token_data.cookiesBlob = f"... ({len(summary_token_data.cookiesBlob)} bytes) ..."
# These will be logged separately below.
if hasattr(summary_token_data, 'requestSummary'):
summary_token_data.requestSummary = "..."
if hasattr(summary_token_data, 'communicationLogPaths'):
summary_token_data.communicationLogPaths = "..."
logger.info(f"Thrift service response summary: {summary_token_data}")
process = subprocess.run(cmd, capture_output=True, text=True, timeout=int(params.get('timeout', DEFAULT_TIMEOUT)))
request_summary = getattr(token_data, 'requestSummary', None)
if request_summary:
# Prepending a newline for better separation in logs.
logger.info(f"\n--- Request Summary ---\n{request_summary}")
if process.stdout:
logger.info(f"ytops-client STDOUT:\n{process.stdout}")
if process.stderr:
logger.info(f"ytops-client STDERR:\n{process.stderr}")
communication_log_paths = getattr(token_data, 'communicationLogPaths', None)
if communication_log_paths:
logger.info("--- Communication Log Paths ---")
for path in communication_log_paths:
logger.info(f" - {path}")
info_json = getattr(token_data, 'infoJson', None)
if not (info_json and json.loads(info_json)):
raise AirflowException("Service returned success but info.json was empty or invalid.")
video_id = _extract_video_id(url)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Create a unique directory for this job's artifacts
job_dir_name = f"{timestamp}-{video_id or 'unknown'}"
job_dir_path = os.path.join(info_json_dir, job_dir_name)
os.makedirs(job_dir_path, exist_ok=True)
info_json_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json"
info_json_path = os.path.join(job_dir_path, info_json_filename)
with open(info_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
return {
'info_json_path': info_json_path,
'socks_proxy': getattr(token_data, proxy_attr) if proxy_attr else None,
'ytdlp_command': getattr(token_data, 'ytdlpCommand', None),
'successful_account_id': account_id,
'original_url': url, # Include original URL for fallback
'clients': clients, # Pass clients string for accurate stats
}
except (PBServiceException, PBUserException, TTransportException) as e:
error_context = getattr(e, 'context', None)
if isinstance(error_context, str):
try: error_context = json.loads(error_context.replace("'", "\""))
except: pass
if process.returncode != 0:
error_message = "ytops-client failed. See logs for details."
for line in reversed(process.stderr.strip().split('\n')):
if 'ERROR' in line or 'Thrift error' in line or 'Connection to server failed' in line:
error_message = line.strip()
break
error_code = 'GET_INFO_CLIENT_FAIL'
if "BOT_DETECTED" in process.stderr:
error_code = "BOT_DETECTED"
elif "BOT_DETECTION_SIGN_IN_REQUIRED" in process.stderr:
error_code = "BOT_DETECTION_SIGN_IN_REQUIRED"
elif "Connection to server failed" in process.stderr:
error_code = "TRANSPORT_ERROR"
error_details = {
'error_message': getattr(e, 'message', str(e)),
'error_code': getattr(e, 'errorCode', 'TRANSPORT_ERROR'),
'proxy_url': error_context.get('proxy_url') if isinstance(error_context, dict) else None
'error_message': error_message,
'error_code': error_code,
'proxy_url': None
}
logger.error(f"Thrift call failed for account '{account_id}'. Exception: {error_details['error_message']}")
ti.xcom_push(key='error_details', value=error_details)
raise AirflowException(f"Thrift call failed: {error_details['error_message']}")
finally:
if transport and transport.isOpen():
transport.close()
raise AirflowException(f"ytops-client get-info failed: {error_message}")
proxy = None
proxy_match = re.search(r"Proxy used: (.*)", process.stderr)
if proxy_match:
proxy = proxy_match.group(1).strip()
return {
'info_json_path': info_json_path,
'socks_proxy': proxy,
'ytdlp_command': None,
'successful_account_id': account_id,
'original_url': url,
'clients': clients,
}
@task.branch
def handle_bannable_error_branch(task_id_to_check: str, **context):
@ -1135,7 +1096,7 @@ with DAG(
'prepend_client_to_account': Param(True, type="boolean", title="[Worker Param] Prepend Client to Account", description="If True, prepends client and timestamp to account names in prefix mode."),
'machine_id': Param(None, type=["string", "null"]),
'assigned_proxy_url': Param(None, type=["string", "null"], title="[Worker Param] Assigned Proxy URL", description="If provided, forces the token service to use this specific proxy for the request."),
'clients': Param('mweb', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"),
'clients': Param('tv_simply', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"),
'timeout': Param(DEFAULT_TIMEOUT, type="integer"),
'on_bannable_failure': Param('stop_loop_on_auth_proceed_on_download_error', type="string", enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'retry_and_ban_account_only', 'retry_on_connection_error', 'proceed_loop_under_manual_inspection', 'stop_loop_on_auth_proceed_on_download_error']),
'request_params_json': Param(json.dumps(DEFAULT_REQUEST_PARAMS), type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."),

View File

@ -300,7 +300,7 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
elif format_preset == 'formats_0':
download_format = '18,140'
elif format_preset == 'formats_2':
download_format = '18,140,299/298/137/136/135/134/133'
download_format = '18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy'
elif format_preset == 'formats_3':
download_format = '18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318'
else:
@ -311,112 +311,102 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
retry_on_probe_failure = params.get('retry_on_probe_failure', False)
def run_yt_dlp_command(format_selector: str):
"""Constructs and runs a yt-dlp command, returning a list of final filenames."""
cmd = [
'yt-dlp', '--verbose', '--print-traffic', '--load-info-json', info_json_path,
'-f', format_selector, '-o', full_output_path,
'--print', 'filename', '--continue', '--no-progress', '--no-simulate',
'--no-write-info-json', '--ignore-errors', '--no-playlist',
]
if params.get('fragment_retries'):
cmd.extend(['--fragment-retries', str(params['fragment_retries'])])
if params.get('limit_rate'):
cmd.extend(['--limit-rate', params['limit_rate']])
if params.get('socket_timeout'):
cmd.extend(['--socket-timeout', str(params['socket_timeout'])])
if params.get('min_sleep_interval'):
cmd.extend(['--min-sleep-interval', str(params['min_sleep_interval'])])
if params.get('max_sleep_interval'):
cmd.extend(['--max-sleep-interval', str(params['max_sleep_interval'])])
if params.get('yt_dlp_test_mode'):
cmd.append('--test')
"""Constructs and runs a yt-ops-client download command, returning a list of final filenames."""
downloader = params.get('downloader', 'py')
cmd = ['ytops-client', 'download', downloader, '--load-info-json', info_json_path, '-f', format_selector]
downloader = params.get('downloader', 'default')
if proxy and not (downloader == 'aria2c' and proxy.startswith('socks5://')):
if proxy:
cmd.extend(['--proxy', proxy])
gost_process = None
try:
if downloader == 'aria2c':
cmd.extend(['--downloader', 'aria2c'])
downloader_args = params.get('downloader_args_aria2c')
if proxy and proxy.startswith('socks5://'):
import socket
from contextlib import closing
def find_free_port():
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
return s.getsockname()[1]
local_port = find_free_port()
http_proxy = f"http://127.0.0.1:{local_port}"
logger.info(f"Starting gost for format '{format_selector}' to forward {proxy} to {http_proxy}")
gost_cmd = ['gost', '-L', f'http://127.0.0.1:{local_port}', '-F', proxy]
gost_process = subprocess.Popen(gost_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
time.sleep(1)
if gost_process.poll() is not None:
stdout, stderr = gost_process.communicate()
logger.error(f"gost failed to start. Exit: {gost_process.returncode}. Stdout: {stdout.decode()}. Stderr: {stderr.decode()}")
raise AirflowException("gost proxy tunnel failed to start.")
user_args = downloader_args[len('aria2c:'):] if downloader_args and downloader_args.startswith('aria2c:') else (downloader_args or "")
final_args_str = f'aria2c:{user_args.strip()} --http-proxy={http_proxy}'
cmd.extend(['--downloader-args', final_args_str])
elif downloader_args:
cmd.extend(['--downloader-args', downloader_args])
if downloader == 'py':
cmd.extend(['--output-dir', download_dir])
# The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args
py_extra_args = []
if params.get('fragment_retries'):
py_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])])
if params.get('limit_rate'):
py_extra_args.extend(['--limit-rate', params['limit_rate']])
if params.get('socket_timeout'):
py_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])])
if params.get('min_sleep_interval'):
py_extra_args.extend(['--sleep-interval', str(params['min_sleep_interval'])])
if params.get('max_sleep_interval'):
py_extra_args.extend(['--max-sleep-interval', str(params['max_sleep_interval'])])
if params.get('yt_dlp_test_mode'):
py_extra_args.append('--test')
extra_args = params.get('yt_dlp_extra_args')
if extra_args:
cmd.extend(shlex.split(extra_args))
if original_url:
cmd.append(original_url)
existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '')
final_extra_args = existing_extra + py_extra_args
if final_extra_args:
cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)])
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
logger.info(f"Executing yt-dlp command for format '{format_selector}': {copy_paste_cmd}")
process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
if process.stdout:
logger.info(f"yt-dlp STDOUT for format '{format_selector}':\n{process.stdout}")
if process.stderr:
# yt-dlp often prints progress and informational messages to stderr
logger.info(f"yt-dlp STDERR for format '{format_selector}':\n{process.stderr}")
elif downloader == 'aria-rpc':
cmd.extend([
'--aria-host', params.get('aria_host', '172.17.0.1'),
'--aria-port', str(params.get('aria_port', 6800)),
'--aria-secret', params.get('aria_secret'),
'--wait', '--auto-merge-fragments',
'--fragments-dir', download_dir,
'--output-dir', download_dir,
])
if params.get('yt_dlp_cleanup_mode'):
cmd.append('--cleanup')
if process.returncode != 0:
logger.error(f"yt-dlp failed for format '{format_selector}' with exit code {process.returncode}")
# STDOUT and STDERR are already logged above.
raise AirflowException(f"yt-dlp command failed for format '{format_selector}'.")
elif downloader == 'cli':
cmd.extend(['--output-dir', download_dir])
# The 'cli' tool is the old yt-dlp wrapper, so it takes similar arguments.
cli_extra_args = []
if params.get('fragment_retries'):
cli_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])])
if params.get('limit_rate'):
cli_extra_args.extend(['--limit-rate', params['limit_rate']])
if params.get('socket_timeout'):
cli_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])])
if params.get('min_sleep_interval'):
cli_extra_args.extend(['--sleep-interval', str(params['min_sleep_interval'])])
if params.get('max_sleep_interval'):
cli_extra_args.extend(['--max-sleep-interval', str(params['max_sleep_interval'])])
if params.get('yt_dlp_test_mode'):
cli_extra_args.append('--test')
# In test mode, files are not created, so we only check that yt-dlp returned filenames.
# Otherwise, we verify that the files actually exist on disk.
output_files = [f for f in process.stdout.strip().split('\n') if f]
if not params.get('yt_dlp_test_mode'):
output_files = [f for f in output_files if os.path.exists(f)]
existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '')
final_extra_args = existing_extra + cli_extra_args
if final_extra_args:
cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)])
if not output_files:
log_msg = (f"Test run for format '{format_selector}' did not produce any filenames."
if params.get('yt_dlp_test_mode') else
f"Download for format '{format_selector}' finished but no output files exist.")
exc_msg = (f"Test run for format '{format_selector}' did not produce any filenames."
if params.get('yt_dlp_test_mode') else
f"Download for format '{format_selector}' did not produce a file.")
logger.error(log_msg)
logger.error(f"Full STDOUT:\n{process.stdout}")
logger.error(f"Full STDERR:\n{process.stderr}")
raise AirflowException(exc_msg)
log_prefix = "SUCCESS (Test Mode):" if params.get('yt_dlp_test_mode') else "SUCCESS:"
logger.info(f"{log_prefix} Command for format '{format_selector}' complete. Files: {output_files}")
return output_files
finally:
if gost_process:
logger.info(f"Terminating gost process (PID: {gost_process.pid}) for format '{format_selector}'.")
gost_process.terminate()
try:
gost_process.wait(timeout=5)
except subprocess.TimeoutExpired:
gost_process.kill()
gost_process.wait()
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
logger.info(f"Executing download command for format '{format_selector}': {copy_paste_cmd}")
process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
if process.stdout:
logger.info(f"Download tool STDOUT for format '{format_selector}':\n{process.stdout}")
if process.stderr:
logger.info(f"Download tool STDERR for format '{format_selector}':\n{process.stderr}")
if process.returncode != 0:
logger.error(f"Download tool failed for format '{format_selector}' with exit code {process.returncode}")
raise AirflowException(f"Download command failed for format '{format_selector}'. See logs for details.")
output_files = []
for line in process.stdout.strip().split('\n'):
# For aria-rpc, parse "Download and merge successful: <path>" or "Download successful: <path>"
match = re.search(r'successful: (.+)', line)
if match:
filepath = match.group(1).strip()
if os.path.exists(filepath):
output_files.append(filepath)
else:
logger.warning(f"File path from aria-rpc output does not exist locally: '{filepath}'")
# For py/cli, it's just the path
elif os.path.exists(line.strip()):
output_files.append(line.strip())
if not params.get('yt_dlp_test_mode') and not output_files:
raise AirflowException(f"Download for format '{format_selector}' finished but no output files were found or exist.")
log_prefix = "SUCCESS (Test Mode):" if params.get('yt_dlp_test_mode') else "SUCCESS:"
logger.info(f"{log_prefix} Command for format '{format_selector}' complete. Files: {output_files}")
return output_files
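
As a concrete instance of the output parsing above (sample lines only; real paths come from the tool), both the aria-rpc "successful: <path>" style and the bare-path py/cli style are recognized, and paths that do not exist locally are dropped:

import os
import re

sample_stdout = (
    "Download and merge successful: /downloads/video [abc].f18.mp4\n"
    "Download successful: /downloads/audio [abc].f140.m4a\n"
    "/downloads/video [abc].f299.mp4\n"
)

output_files = []
for line in sample_stdout.strip().split("\n"):
    match = re.search(r"successful: (.+)", line)
    candidate = (match.group(1) if match else line).strip()
    if os.path.exists(candidate):  # same existence filter as the task above
        output_files.append(candidate)
print(output_files)  # [] for these sample paths, since they do not exist locally
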
def run_ffmpeg_probe(filename):
"""Probes a file with ffmpeg to check for corruption."""
@ -824,7 +814,7 @@ with DAG(
type="string",
enum=['best_audio', 'formats_0', 'formats_2', 'formats_3', 'custom'],
title="Download Format Preset",
description="Select a predefined format string or choose 'custom'. To download multiple formats, this should be a comma-separated list of format IDs (e.g., '137,140').\nformats_0: 18,140\nformats_2: 18,140,299/298/137/136/135/134/133\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318"
description="Select a predefined format string or choose 'custom'. To download multiple formats, this should be a comma-separated list of format IDs (e.g., '137,140').\nformats_0: 18,140\nformats_2: 18-dashy,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318"
),
'download_format_custom': Param(
'ba[ext=m4a]/bestaudio/best',
@ -833,18 +823,15 @@ with DAG(
description="Custom yt-dlp format string. Used when preset is 'custom'. To download multiple formats, provide a comma-separated list of format IDs (e.g., '137,140')."
),
'downloader': Param(
'default',
'cli',
type="string",
enum=['default', 'aria2c'],
title="Downloader",
description="Choose the downloader for yt-dlp."
),
'downloader_args_aria2c': Param(
'aria2c:-x 4 -k 2M --max-download-limit=3M',
type="string",
title="Aria2c Downloader Arguments",
description="Arguments to pass to yt-dlp's --downloader-args. Used when downloader is 'aria2c'."
enum=['py', 'aria-rpc', 'cli'],
title="Download Tool",
description="Choose the download tool to use: 'py' (native python, recommended), 'aria-rpc' (send to aria2c daemon), 'cli' (legacy yt-dlp wrapper)."
),
'aria_host': Param('172.17.0.1', type="string", title="Aria2c Host", description="For 'aria-rpc' downloader: Host of the aria2c RPC server."),
'aria_port': Param(6800, type="integer", title="Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server."),
'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="Aria2c Secret", description="For 'aria-rpc' downloader: Secret token."),
'yt_dlp_extra_args': Param(
'--no-part --restrict-filenames',
type=["string", "null"],

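A minimal sketch of how these DAG parameters might be assembled into a ytops-client invocation (the helper name build_download_cmd is hypothetical; the 'download' subcommand names and the --aria-* flags are the ones defined in ytops_client/cli.py and the aria-rpc parser below):

def build_download_cmd(params, format_selector):
    # 'downloader' is one of 'py', 'aria-rpc' or 'cli' -- the download subcommands.
    cmd = ['ytops-client', 'download', params['downloader'], '-f', format_selector]
    if params['downloader'] == 'aria-rpc':
        # The aria2c RPC connection details come from the aria_* params above.
        cmd += ['--aria-host', str(params['aria_host']),
                '--aria-port', str(params['aria_port']),
                '--aria-secret', params['aria_secret']]
    return cmd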
View File

@ -5,6 +5,9 @@
vars_files:
- "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
- "{{ inventory_dir }}/group_vars/all/vault.yml"
vars:
envoy_port: 9980
envoy_admin_port: 9981
pre_tasks:
- name: Announce master deployment
debug:

1
ytops_client/__init__.py Normal file
View File

@ -0,0 +1 @@
# This file makes 'ytops_client' a Python package.

88
ytops_client/cli.py Normal file
View File

@ -0,0 +1,88 @@
#!/usr/bin/env python3
import sys
import argparse
# Import the functions that define and execute the logic for each subcommand
from .list_formats_tool import add_list_formats_parser, main_list_formats
from .get_info_tool import add_get_info_parser, main_get_info
from .download_tool import add_download_parser, main_download
from .stress_policy_tool import add_stress_policy_parser, main_stress_policy
from .stress_formats_tool import add_stress_formats_parser, main_stress_formats
from .cookie_tool import add_cookie_tool_parser, main_cookie_tool
from .download_aria_tool import add_download_aria_parser, main_download_aria
from .download_native_py_tool import add_download_native_py_parser, main_download_native_py
def main():
"""
Main entry point for the yt-ops-client CLI.
Parses arguments and dispatches to the appropriate subcommand function.
"""
# Workaround for argparse behavior with positional arguments that start with a hyphen.
# If the command is 'get-info' and the last argument looks like a video ID
# starting with a '-', we insert '--' before it to tell argparse to treat it
# as a positional argument, not an option. This assumes the URL is the last argument.
if len(sys.argv) >= 3 and sys.argv[1] == 'get-info':
last_arg = sys.argv[-1]
# A YouTube video ID is 11 characters.
if last_arg.startswith('-') and len(last_arg) == 11:
import re
if re.fullmatch(r'-[a-zA-Z0-9_-]{10}', last_arg):
sys.argv.insert(len(sys.argv) - 1, '--')
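# Example: `ytops-client get-info -AbCdEfGhIj` (hypothetical ID) would otherwise be
# rejected as an unknown option; the rewrite passes it on as `get-info -- -AbCdEfGhIj`.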
parser = argparse.ArgumentParser(
description="YT Ops Client Tools",
formatter_class=argparse.RawTextHelpFormatter
)
subparsers = parser.add_subparsers(dest='command', help='Available sub-commands')
# Add subparsers from each tool module
add_list_formats_parser(subparsers)
add_get_info_parser(subparsers)
# Create a top-level 'download' command with its own subcommands
download_parser = subparsers.add_parser(
'download',
help='Download using different methods.',
description='Provides access to various download tools. Use "download <method> --help" for details.'
)
download_subparsers = download_parser.add_subparsers(dest='download_command', help='Available downloaders', required=True)
add_download_parser(download_subparsers) # Adds 'cli' subcommand
add_download_native_py_parser(download_subparsers) # Adds 'py' subcommand
add_download_aria_parser(download_subparsers) # Adds 'aria-rpc' subcommand
add_stress_policy_parser(subparsers)
add_stress_formats_parser(subparsers)
add_cookie_tool_parser(subparsers)
args = parser.parse_args()
# If no command is provided, print help and exit.
if not args.command:
parser.print_help()
return 1
# Dispatch to the correct main function based on the command
if args.command == 'list-formats':
return main_list_formats(args)
elif args.command == 'get-info':
return main_get_info(args)
elif args.command == 'download':
if args.download_command == 'cli':
return main_download(args)
elif args.download_command == 'py':
return main_download_native_py(args)
elif args.download_command == 'aria-rpc':
return main_download_aria(args)
elif args.command == 'stress-policy':
return main_stress_policy(args)
elif args.command == 'stress-formats':
return main_stress_formats(args)
elif args.command == 'convert-cookies':
return main_cookie_tool(args)
# This path should not be reachable if a command is required or handled above.
parser.print_help()
return 1
if __name__ == "__main__":
sys.exit(main())

139
ytops_client/cookie_tool.py Normal file
View File

@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""
Tool to convert JSON cookies to the standard Netscape txt format.
"""
import argparse
import json
import sys
import logging
# Configure logging
logger = logging.getLogger('cookie_tool')
def convert_json_to_netscape(json_data):
"""
Converts a list of cookie dictionaries to a Netscape format string.
"""
netscape_cookies = []
# The header is optional but good practice for some tools.
netscape_cookies.append("# Netscape HTTP Cookie File")
netscape_cookies.append("# http://www.netscape.com/newsref/std/cookie_spec.html")
netscape_cookies.append("# This is a generated file! Do not edit.")
netscape_cookies.append("")
if not isinstance(json_data, list):
raise TypeError("Input JSON must be a list of cookie objects.")
for cookie in json_data:
if not isinstance(cookie, dict):
logger.warning(f"Skipping non-dictionary item in JSON list: {cookie}")
continue
domain = cookie.get('domain', '')
# The 'hostOnly' flag determines if the domain is accessible to subdomains.
# Netscape format's flag is TRUE if subdomains can access it.
# So, hostOnly=false means flag=TRUE.
# A leading dot in the domain also implies this for some implementations.
if domain.startswith('.'):
include_subdomains = 'TRUE'
else:
include_subdomains = 'FALSE' if cookie.get('hostOnly', True) else 'TRUE'
path = cookie.get('path', '/')
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
# Expiration date. If session cookie or no expiration, use 0.
if cookie.get('session', False) or 'expirationDate' not in cookie or cookie['expirationDate'] is None:
expires = 0
else:
expires = int(cookie['expirationDate'])
name = cookie.get('name', '')
value = str(cookie.get('value', ''))
# Skip cookies without essential fields
if not domain or not name:
logger.warning(f"Skipping cookie with missing domain or name: {cookie}")
continue
netscape_cookies.append(
f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expires}\t{name}\t{value}"
)
return "\n".join(netscape_cookies)
def add_cookie_tool_parser(subparsers):
"""Add the parser for the 'convert-cookies' command."""
parser = subparsers.add_parser(
'convert-cookies',
description='Convert JSON cookies to Netscape format.',
formatter_class=argparse.RawTextHelpFormatter,
help='Convert JSON cookies to Netscape format.',
epilog="""
Reads a JSON array of cookie objects from stdin and prints the
Netscape cookie file format to stdout.
Example JSON input format (per cookie):
{
"domain": ".example.com",
"hostOnly": false,
"path": "/",
"secure": true,
"expirationDate": 1672531199,
"name": "my_cookie",
"value": "my_value"
}
Example usage:
cat cookies.json | yt-ops-client convert-cookies > cookies.txt
"""
)
parser.add_argument(
'input_file',
nargs='?',
type=argparse.FileType('r', encoding='utf-8'),
default=sys.stdin,
help="Path to the JSON cookie file. Reads from stdin if not provided."
)
parser.add_argument(
'-o', '--output',
type=argparse.FileType('w', encoding='utf-8'),
default=sys.stdout,
help="Output file path for the Netscape cookies. Defaults to stdout."
)
parser.add_argument('--verbose', action='store_true', help='Enable verbose logging.')
return parser
def main_cookie_tool(args):
"""Main logic for the 'convert-cookies' command."""
if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s', stream=sys.stderr)
else:
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s', stream=sys.stderr)
try:
json_content = args.input_file.read()
if not json_content.strip():
logger.error("Input is empty.")
return 1
cookie_data = json.loads(json_content)
netscape_string = convert_json_to_netscape(cookie_data)
args.output.write(netscape_string + '\n')
if args.output is not sys.stdout:
logger.info(f"Successfully converted cookies to {args.output.name}")
return 0
except json.JSONDecodeError:
logger.error("Invalid JSON provided. Please check the input file.")
return 1
except TypeError as e:
logger.error(f"Error processing JSON: {e}")
return 1
except Exception as e:
logger.error(f"An unexpected error occurred: {e}", exc_info=args.verbose)
return 1
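For the sample cookie in the epilog above, convert_json_to_netscape would emit the following data line after the header comments (fields are tab-separated):

.example.com	TRUE	/	TRUE	1672531199	my_cookie	my_value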

687
ytops_client/download_aria_tool.py Normal file
View File

@ -0,0 +1,687 @@
#!/usr/bin/env python3
"""
Tool to send a download to an aria2c daemon via RPC.
"""
import argparse
import json
import logging
import sys
import os
import glob
import shutil
import re
import shlex
import time
from urllib.parse import urljoin
try:
import aria2p
from aria2p.utils import human_readable_bytes
except ImportError:
print("aria2p is not installed. Please install it with: pip install aria2p", file=sys.stderr)
sys.exit(1)
logger = logging.getLogger('download_aria_tool')
class TimeoutError(Exception):
pass
def add_download_aria_parser(subparsers):
"""Add the parser for the 'download aria-rpc' command."""
parser = subparsers.add_parser(
'aria-rpc',
description='Send a download to an aria2c daemon via RPC, using an info.json from stdin or a file.',
formatter_class=argparse.RawTextHelpFormatter,
help='Download a specific format using aria2c RPC.',
epilog="""
Usage Notes for Fragmented Downloads (e.g., DASH):
To download and automatically merge fragmented formats, you must:
1. Use '--wait' to make the operation synchronous.
2. Use '--auto-merge-fragments' to enable the merge logic.
3. Ensure this script has access to the directory where aria2c saves files.
Example for a remote aria2c daemon:
- The remote daemon saves files to '/srv/downloads' on its machine.
- This directory is mounted locally at '/mnt/remote_aria2_downloads'.
cat latest-info.json | yt-ops-client download aria-rpc -f "299/137" \\
--wait --auto-merge-fragments \\
--remote-dir /srv/downloads \\
--fragments-dir /mnt/remote_aria2_downloads
"""
)
parser.add_argument('--load-info-json', type=argparse.FileType('r', encoding='utf-8'), help="Path to the info.json file. If not provided, reads from stdin.")
parser.add_argument('-f', '--format', required=True, help='The format ID to download. Supports yt-dlp style format selectors (e.g., "137/136,140").')
parser.add_argument('--output-dir', help='Local directory to save the final merged file. Defaults to the current directory.')
parser.add_argument('--fragments-dir', help='The local path where this script should look for downloaded fragments. If the aria2c daemon is remote, this should be a local mount point corresponding to --remote-dir. Defaults to --output-dir.')
parser.add_argument('--remote-dir', help='The absolute path to the download directory on the remote aria2c host. This is passed via RPC.')
parser.add_argument('--aria-host', default='localhost', help='The host of the aria2c RPC server. Default: localhost.')
parser.add_argument('--aria-port', type=int, default=6800, help='The port of the aria2c RPC server. Default: 6800.')
parser.add_argument('--aria-secret', help='The secret token for the aria2c RPC server (often required, e.g., "SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX").')
parser.add_argument('--proxy', help='Proxy to use for the download, e.g., "socks5://127.0.0.1:1080".')
parser.add_argument('--downloader-args', help='Arguments for aria2c, in yt-dlp format (e.g., "aria2c:[-x 8, -k 1M]").')
parser.add_argument('--wait', action='store_true', help='Wait for the download to complete and report its status. Note: This makes the operation synchronous and will block until the download finishes.')
parser.add_argument('--wait-timeout', help='Timeout in seconds for waiting on downloads. Use "auto" to calculate based on a minimum speed of 200KiB/s. Requires --wait. Default: no timeout.')
parser.add_argument('--auto-merge-fragments', action='store_true', help='Automatically merge fragments after download. Requires --wait and assumes the script has filesystem access to the aria2c host.')
parser.add_argument('--remove-fragments-after-merge', action='store_true', help='Delete individual fragment files after a successful merge. Requires --auto-merge-fragments.')
parser.add_argument('--cleanup', action='store_true', help='After a successful download, remove the final file(s) from the filesystem. For fragmented downloads, this implies --remove-fragments-after-merge.')
parser.add_argument('--remove-on-complete', action=argparse.BooleanOptionalAction, default=True, help='Remove the download from aria2c history on successful completion. Use --no-remove-on-complete to disable. May fail on older aria2c daemons.')
parser.add_argument('--purge-on-complete', action='store_true', help='Use aria2.purgeDownloadResult to clear ALL completed/failed downloads from history on success. Use as a workaround for older daemons.')
parser.add_argument('--verbose', action='store_true', help='Enable verbose output for this script.')
return parser
def cleanup_aria_download(api, downloads):
"""Pause and remove downloads from aria2c."""
if not downloads:
return
try:
logger.info(f"Attempting to clean up {len(downloads)} download(s) from aria2c...")
# Filter out downloads that might already be gone
valid_downloads = [d for d in downloads if hasattr(d, 'gid')]
if not valid_downloads:
logger.info("No valid downloads to clean up.")
return
api.pause(valid_downloads)
# Give aria2c a moment to process the pause command before removing
time.sleep(0.5)
api.remove(valid_downloads)
logger.info("Cleanup successful.")
except Exception as e:
logger.warning(f"An error occurred during aria2c cleanup: {e}")
def parse_aria_error(download):
"""Parses an aria2p Download object to get a detailed error message."""
error_code = download.error_code
error_message = download.error_message
if not error_message:
return f"Unknown aria2c error (Code: {error_code})"
# Check for common HTTP errors in the message
http_status_match = re.search(r'HTTP status (\d+)', error_message)
if http_status_match:
status_code = int(http_status_match.group(1))
if status_code == 403:
return f"HTTP Error 403: Forbidden. The URL may have expired or requires valid cookies/headers."
elif status_code == 404:
return f"HTTP Error 404: Not Found. The resource is unavailable."
else:
return f"HTTP Error {status_code}."
if "Timeout" in error_message or "timed out" in error_message.lower():
return "Download timed out."
# Fallback to the raw error message
return f"Aria2c error (Code: {error_code}): {error_message}"
def parse_aria_args_to_options(args_str):
"""
Parses yt-dlp style downloader args for aria2c.
Example: "aria2c:[-x 8, -k 1M]" or just "-x 8 -k 1M"
Returns a dictionary of options for aria2p.
"""
if not args_str or not args_str.strip():
return {}
inner_args_str = args_str.strip()
match = re.match(r'aria2c:\s*\[(.*)\]', inner_args_str)
if match:
# Handle yt-dlp's format
inner_args_str = match.group(1).replace(',', ' ')
else:
# If it doesn't match, assume the whole string is a set of arguments.
logger.debug(f"Downloader args '{args_str}' does not match 'aria2c:[...]' format. Parsing as a raw argument string.")
arg_list = shlex.split(inner_args_str)
# Use a mini-parser to handle CLI-style args
parser = argparse.ArgumentParser(add_help=False, prog="aria2c_args_parser")
parser.add_argument('-x', '--max-connection-per-server')
parser.add_argument('-k', '--min-split-size')
parser.add_argument('-s', '--split')
parser.add_argument('--all-proxy')
try:
# We only care about known arguments
known_args, unknown_args = parser.parse_known_args(arg_list)
if unknown_args:
logger.warning(f"Ignoring unknown arguments in --downloader-args: {unknown_args}")
# Convert to dict, removing None values. argparse stores dests with underscores,
# but aria2c RPC expects hyphenated option names (e.g. 'min-split-size').
return {k.replace('_', '-'): v for k, v in vars(known_args).items() if v is not None}
except Exception:
logger.warning(f"Failed to parse arguments inside --downloader-args: '{inner_args_str}'")
return {}
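# Example: parse_aria_args_to_options('aria2c:[-x 8, -k 1M]') would yield
# {'max-connection-per-server': '8', 'min-split-size': '1M'}.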
def main_download_aria(args):
"""Main logic for the 'download-aria' command."""
log_level = logging.DEBUG if args.verbose else logging.INFO
logging.basicConfig(level=log_level, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', stream=sys.stderr)
if args.remove_fragments_after_merge and not args.auto_merge_fragments:
logger.error("--remove-fragments-after-merge requires --auto-merge-fragments.")
return 1
if args.auto_merge_fragments and not args.wait:
logger.error("--auto-merge-fragments requires --wait.")
return 1
if args.wait_timeout and not args.wait:
logger.error("--wait-timeout requires --wait.")
return 1
if args.wait:
logger.info("Will wait for download to complete and report status. This is a synchronous operation.")
else:
logger.info("Will submit download and exit immediately (asynchronous).")
info_json_content = ""
input_source_name = ""
if args.load_info_json:
info_json_content = args.load_info_json.read()
input_source_name = args.load_info_json.name
else:
info_json_content = sys.stdin.read()
input_source_name = "stdin"
if not info_json_content.strip():
logger.error(f"Failed to read info.json from {input_source_name}. Input is empty.")
return 1
try:
info_data = json.loads(info_json_content)
logger.info(f"Successfully loaded info.json from {input_source_name}.")
except json.JSONDecodeError:
logger.error(f"Failed to parse info.json from {input_source_name}. Is the input valid JSON?")
return 1
# Find the requested format, supporting yt-dlp style selectors
target_format = None
# A format selector can be a comma-separated list of preferences,
# where each preference can be a slash-separated list of format_ids.
# e.g., "299/137/136,140" means try 299, then 137, then 136, then 140.
format_preferences = [item.strip() for sublist in (i.split('/') for i in args.format.split(',')) for item in sublist if item.strip()]
available_formats_map = {f['format_id']: f for f in info_data.get('formats', []) if 'format_id' in f}
for format_id in format_preferences:
if format_id in available_formats_map:
target_format = available_formats_map[format_id]
logger.info(f"Selected format ID '{format_id}' from selector '{args.format}'.")
break
if not target_format:
logger.error(f"No suitable format found for selector '{args.format}' in info.json.")
return 1
# Get file size for auto-timeout and dynamic options
total_filesize = target_format.get('filesize') or target_format.get('filesize_approx')
# Construct filename
video_id = info_data.get('id', 'unknown_video_id')
title = info_data.get('title', 'unknown_title')
ext = target_format.get('ext', 'mp4')
# Sanitize title for filename
safe_title = "".join([c for c in title if c.isalpha() or c.isdigit() or c in (' ', '-', '_')]).rstrip()
filename = f"{safe_title} [{video_id}].f{target_format['format_id']}.{ext}"
# Prepare options for aria2
aria_options = {
# Options from yt-dlp's aria2c integration for performance and reliability
'max-connection-per-server': 16,
'split': 16,
'min-split-size': '1M',
'http-accept-gzip': 'true',
'file-allocation': 'none',
}
if args.proxy:
aria_options['all-proxy'] = args.proxy
custom_options = parse_aria_args_to_options(args.downloader_args)
# Dynamically set min-split-size if not overridden by user
if 'min-split-size' not in custom_options and total_filesize:
if total_filesize > 100 * 1024 * 1024: # 100 MiB
aria_options['min-split-size'] = '5M'
logger.info("File is > 100MiB, dynamically setting min-split-size to 5M.")
if custom_options:
aria_options.update(custom_options)
logger.info(f"Applied custom aria2c options from --downloader-args: {custom_options}")
aria_options['out'] = filename
# Add headers from info.json, mimicking yt-dlp's behavior for aria2c
headers = target_format.get('http_headers')
if headers:
header_list = [f'{key}: {value}' for key, value in headers.items()]
aria_options['header'] = header_list
logger.info(f"Adding {len(header_list)} HTTP headers to the download.")
if args.verbose:
for h in header_list:
if h.lower().startswith('cookie:'):
logger.debug(f" Header: Cookie: [REDACTED]")
else:
logger.debug(f" Header: {h}")
is_fragmented = 'fragments' in target_format
if not is_fragmented:
url = target_format.get('url')
if not url:
logger.error(f"Format ID '{args.format}' has neither a URL nor fragments.")
return 1
try:
logger.info(f"Connecting to aria2c RPC at http://{args.aria_host}:{args.aria_port}")
client = aria2p.Client(
host=f"http://{args.aria_host}",
port=args.aria_port,
secret=args.aria_secret or ""
)
api = aria2p.API(client)
timeout_seconds = None
if args.wait_timeout:
if args.wait_timeout.lower() == 'auto':
if total_filesize:
# Min speed: 200 KiB/s. Min timeout: 30s.
min_speed = 200 * 1024
calculated_timeout = int(total_filesize / min_speed)
timeout_seconds = max(30, calculated_timeout)
# human_readable_bytes returns a formatted string, e.g. '150.00MiB'.
total_filesize_hr = human_readable_bytes(total_filesize)
logger.info(f"Auto-calculated timeout: {timeout_seconds}s (based on {total_filesize_hr} at 200KiB/s).")
else:
logger.warning("Cannot use 'auto' timeout: file size not available in info.json. Timeout disabled.")
else:
try:
timeout_seconds = int(args.wait_timeout)
if timeout_seconds <= 0:
raise ValueError
except ValueError:
logger.error(f"Invalid --wait-timeout value: '{args.wait_timeout}'. Must be a positive integer or 'auto'.")
return 1
if is_fragmented:
return download_fragments_aria(args, api, target_format, filename, aria_options, timeout_seconds, remote_dir=args.remote_dir)
else:
return download_url_aria(args, api, url, filename, aria_options, timeout_seconds, remote_dir=args.remote_dir)
except Exception as e:
logger.error(f"An error occurred while communicating with aria2c: {e}", exc_info=args.verbose)
return 1
def download_url_aria(args, api, url, filename, aria_options, timeout_seconds, remote_dir=None):
"""Handle downloading a single URL with aria2c."""
if remote_dir:
aria_options['dir'] = remote_dir
logger.info(f"Adding download for format '{args.format}' with URL: {url[:70]}...")
downloads = api.add_uris([url], options=aria_options)
if not downloads:
logger.error("Failed to add download to aria2c. The API returned an empty result.")
return 1
# Handle older aria2p versions that return a single Download object instead of a list
download = downloads[0] if isinstance(downloads, list) else downloads
logger.info(f"Successfully added download to aria2c. GID: {download.gid}")
if args.wait:
logger.info(f"Waiting for download {download.gid} to complete...")
start_time = time.time()
try:
while True:
if timeout_seconds and (time.time() - start_time > timeout_seconds):
raise TimeoutError(f"Download did not complete within {timeout_seconds}s timeout.")
# Re-fetch the download object to get the latest status
download.update()
# A download is no longer active if it's complete, errored, paused, or removed.
if download.status not in ('active', 'waiting'):
break
progress_info = (
f"\rGID {download.gid}: {download.status} "
f"{download.progress_string()} "
f"({download.download_speed_string()}) "
f"ETA: {download.eta_string()}"
)
sys.stdout.write(progress_info)
sys.stdout.flush()
time.sleep(0.5)
except (KeyboardInterrupt, TimeoutError) as e:
sys.stdout.write('\n')
if isinstance(e, KeyboardInterrupt):
logger.warning("Wait interrupted by user. Cleaning up download...")
cleanup_aria_download(api, [download])
return 130
else: # TimeoutError
logger.error(f"Download timed out. Cleaning up... Error: {e}")
cleanup_aria_download(api, [download])
return 1
except aria2p.ClientException as e:
# This can happen if the download completes and is removed by aria2c
# before we can check its final status. Assume success in this case.
logger.warning(f"Could not get final status for GID {download.gid} (maybe removed on completion?): {e}. Assuming success.")
print(f"Download for GID {download.gid} presumed successful.")
return 0
sys.stdout.write('\n') # Newline after progress bar
# Final status check (no need to update again, we have the latest status)
if download.status == 'complete':
logger.info(f"Download {download.gid} completed successfully.")
downloaded_filepath_remote = None
if download.files:
downloaded_filepath_remote = download.files[0].path
print(f"Download successful: {downloaded_filepath_remote}")
else:
print("Download successful, but no file path reported by aria2c.")
if args.cleanup and downloaded_filepath_remote:
local_filepath = None
# To map remote path to local, we need remote_dir and a local equivalent.
# We'll use fragments_dir as the local equivalent, which defaults to output_dir.
local_base_dir = args.fragments_dir or args.output_dir or '.'
if remote_dir:
if downloaded_filepath_remote.startswith(remote_dir):
relative_path = os.path.relpath(downloaded_filepath_remote, remote_dir)
local_filepath = os.path.join(local_base_dir, relative_path)
else:
logger.warning(f"Cleanup: Downloaded file path '{downloaded_filepath_remote}' does not start with remote-dir '{remote_dir}'. Cannot map to local path.")
else:
logger.warning(f"Cleanup: --remote-dir not specified. Assuming download path is accessible locally as '{downloaded_filepath_remote}'.")
local_filepath = downloaded_filepath_remote
if local_filepath:
try:
if os.path.exists(local_filepath):
os.remove(local_filepath)
logger.info(f"Cleanup: Removed downloaded file '{local_filepath}'")
else:
logger.warning(f"Cleanup: File not found at expected local path '{local_filepath}'. Skipping removal.")
except OSError as e:
logger.error(f"Cleanup failed: Could not remove file '{local_filepath}': {e}")
elif args.cleanup:
logger.warning("Cleanup requested, but no downloaded file path was reported by aria2c.")
if args.purge_on_complete:
try:
api.purge_download_result()
logger.info("Purged all completed/failed downloads from aria2c history.")
except Exception as e:
logger.warning(f"Failed to purge download history: {e}")
elif args.remove_on_complete:
try:
api.remove_download_result(download)
logger.info(f"Removed download {download.gid} from aria2c history.")
except Exception as e:
logger.warning(f"Failed to remove download {download.gid} from history: {e}")
return 0
else:
detailed_error = parse_aria_error(download)
logger.error(f"Download {download.gid} failed. Error: {detailed_error}")
return 1
else:
print(f"Successfully added download. GID: {download.gid}")
return 0
def download_fragments_aria(args, api, target_format, filename, aria_options, timeout_seconds, remote_dir=None):
"""Handle downloading fragmented formats with aria2c."""
logger.info(f"Format '{args.format}' is fragmented. Adding all fragments to download queue.")
fragment_base_url = target_format.get('fragment_base_url')
fragments = target_format['fragments']
MAX_FRAGMENTS = 50000
if len(fragments) > MAX_FRAGMENTS:
logger.error(
f"The number of fragments ({len(fragments)}) exceeds the safety limit of {MAX_FRAGMENTS}. "
f"This is to prevent overwhelming the aria2c server. Aborting."
)
return 1
# We need to set the 'dir' option for all fragments if specified.
# The 'out' option will be set per-fragment.
frag_aria_options = aria_options.copy()
frag_aria_options.pop('out', None) # Remove the main 'out' option
if remote_dir:
frag_aria_options['dir'] = remote_dir
logger.info(f"Instructing remote aria2c to save fragments to: {remote_dir}")
base_filename, file_ext = os.path.splitext(filename)
calls = []
for i, fragment in enumerate(fragments):
frag_url = fragment.get('url')
if not frag_url:
if not fragment_base_url:
logger.error(f"Fragment {i} has no URL and no fragment_base_url is available. Aborting.")
return 1
frag_url = urljoin(fragment_base_url, fragment['path'])
# Use the base filename from the main file, but add fragment identifier
fragment_filename = f"{base_filename}-Frag{i}{file_ext}"
current_frag_options = frag_aria_options.copy()
current_frag_options['out'] = os.path.basename(fragment_filename)
# Prepare parameters for multicall in the format:
# {"methodName": "aria2.addUri", "params": [["url"], {"out": "file.mp4"}]}
# The secret token is automatically added by aria2p.
params = [[frag_url], current_frag_options]
call_struct = {
"methodName": api.client.ADD_URI,
"params": params
}
calls.append(call_struct)
results = api.client.multicall(calls)
if not results:
logger.error("Failed to add fragments to aria2c. The API returned an empty result.")
return 1
# The result of a multicall of addUri is a list of lists, where each inner list
# contains the GID of one download, e.g., [['gid1'], ['gid2']].
# A failed call for a fragment may result in a fault struct dict instead of a list.
# We extract GIDs from successful calls.
gids = [result[0] for result in results if isinstance(result, list) and result]
if len(gids) != len(fragments):
failed_count = len(fragments) - len(gids)
logger.warning(f"{failed_count} out of {len(fragments)} fragments failed to be added to aria2c.")
if not gids:
logger.error("Failed to add any fragments to aria2c. All submissions failed.")
return 1
logger.info(f"Successfully added {len(gids)} fragments to aria2c.")
if args.verbose:
logger.debug(f"GIDs: {gids}")
if args.wait:
logger.info(f"Waiting for {len(gids)} fragments to complete...")
start_time = time.time()
downloads_to_cleanup = []
try:
while True:
if timeout_seconds and (time.time() - start_time > timeout_seconds):
raise TimeoutError(f"Fragment downloads did not complete within {timeout_seconds}s timeout.")
downloads = api.get_downloads(gids)
downloads_to_cleanup = downloads # Store for potential cleanup
# A download is considered "active" if it's currently downloading or waiting in the queue.
# It is "not active" if it is complete, errored, paused, or removed.
active_downloads = [d for d in downloads if d.status in ('active', 'waiting')]
if not active_downloads:
break # All downloads are complete or have stopped for other reasons
for d in active_downloads:
d.update()
completed_count = len(downloads) - len(active_downloads)
total_bytes = sum(d.total_length for d in downloads)
downloaded_bytes = sum(d.completed_length for d in downloads)
total_speed = sum(d.download_speed for d in downloads)
progress_percent = (downloaded_bytes / total_bytes * 100) if total_bytes > 0 else 0
progress_info = (
f"\rProgress: {completed_count}/{len(downloads)} fragments | "
f"{progress_percent:.1f}% "
f"({human_readable_bytes(downloaded_bytes)}/{human_readable_bytes(total_bytes)}) "
f"Speed: {human_readable_bytes(total_speed)}/s"
)
sys.stdout.write(progress_info)
sys.stdout.flush()
time.sleep(0.5)
except (KeyboardInterrupt, TimeoutError) as e:
sys.stdout.write('\n')
if isinstance(e, KeyboardInterrupt):
logger.warning("Wait interrupted by user. Cleaning up fragments...")
cleanup_aria_download(api, downloads_to_cleanup)
return 130
else: # TimeoutError
logger.error(f"Download timed out. Cleaning up fragments... Error: {e}")
cleanup_aria_download(api, downloads_to_cleanup)
return 1
except aria2p.ClientException as e:
# This can happen if downloads complete and are removed by aria2c
# before we can check their final status. Assume success in this case.
logger.warning(f"Could not get final status for some fragments (maybe removed on completion?): {e}. Assuming success.")
sys.stdout.write('\n')
# Final status check
failed_downloads = []
try:
downloads = api.get_downloads(gids)
failed_downloads = [d for d in downloads if d.status != 'complete']
except aria2p.ClientException as e:
logger.warning(f"Could not perform final status check for fragments (maybe removed on completion?): {e}. Assuming success.")
# If we can't check, we assume success based on the earlier wait loop not failing catastrophically.
failed_downloads = []
if failed_downloads:
logger.error(f"{len(failed_downloads)} fragments failed to download.")
for d in failed_downloads:
detailed_error = parse_aria_error(d)
logger.error(f" GID {d.gid}: {detailed_error}")
return 1
else:
logger.info("All fragments downloaded successfully.")
output_dir = args.output_dir or '.'
final_filepath = os.path.join(output_dir, filename)
fragments_lookup_dir = args.fragments_dir or output_dir
if args.auto_merge_fragments:
logger.info(f"Attempting to merge fragments into: {final_filepath}")
logger.info(f"Searching for fragments in local directory: {os.path.abspath(fragments_lookup_dir)}")
try:
# base_filename and file_ext are available from earlier in the function
# We must escape the base filename in case it contains glob special characters like [ or ].
escaped_base = glob.escape(base_filename)
search_path = os.path.join(fragments_lookup_dir, f"{escaped_base}-Frag*{file_ext}")
fragment_files = glob.glob(search_path)
if not fragment_files:
logger.error(f"No fragment files found with pattern: {search_path}")
return 1
def fragment_sort_key(f):
match = re.search(r'Frag(\d+)', os.path.basename(f))
return int(match.group(1)) if match else -1
fragment_files.sort(key=fragment_sort_key)
with open(final_filepath, 'wb') as dest_file:
for frag_path in fragment_files:
with open(frag_path, 'rb') as src_file:
shutil.copyfileobj(src_file, dest_file)
logger.info(f"Successfully merged {len(fragment_files)} fragments into {final_filepath}")
if args.remove_fragments_after_merge or args.cleanup:
logger.info("Removing fragment files...")
for frag_path in fragment_files:
os.remove(frag_path)
logger.info("Fragment files removed.")
if args.cleanup:
try:
os.remove(final_filepath)
logger.info(f"Cleanup: Removed merged file '{final_filepath}'")
except OSError as e:
logger.error(f"Cleanup failed: Could not remove merged file '{final_filepath}': {e}")
print(f"Download and merge successful: {final_filepath}")
if args.purge_on_complete:
try:
api.purge_download_result()
logger.info("Purged all completed/failed downloads from aria2c history.")
except Exception as e:
logger.warning(f"Failed to purge download history: {e}")
elif args.remove_on_complete:
try:
# The `downloads` variable from the last status check should be valid here.
api.remove_download_result(downloads)
logger.info(f"Removed {len(downloads)} fragment downloads from aria2c history.")
except aria2p.ClientException as e:
logger.warning(f"Could not remove fragment downloads from history (maybe already gone?): {e}")
except Exception as e:
logger.warning(f"Failed to remove fragment downloads from history: {e}")
return 0
except Exception as e:
logger.error(f"An error occurred during merging: {e}", exc_info=args.verbose)
logger.error("Fragments were downloaded but not merged.")
return 1
else:
print("Download successful. Fragments now need to be merged manually.")
print(f"The final merged file should be named: {final_filepath}")
print("You can merge them with a command like:")
print(f" cat `ls -v '{os.path.join(fragments_lookup_dir, base_filename)}'-Frag*'{file_ext}'` > '{final_filepath}'")
if args.cleanup:
logger.info("Cleanup requested. Removing downloaded fragments...")
try:
# base_filename and file_ext are available from earlier in the function
escaped_base = glob.escape(base_filename)
search_path = os.path.join(fragments_lookup_dir, f"{escaped_base}-Frag*{file_ext}")
fragment_files = glob.glob(search_path)
if not fragment_files:
logger.warning(f"Cleanup: No fragment files found with pattern: {search_path}")
else:
for frag_path in fragment_files:
os.remove(frag_path)
logger.info(f"Removed {len(fragment_files)} fragment files.")
except Exception as e:
logger.error(f"An error occurred during fragment cleanup: {e}", exc_info=args.verbose)
if args.purge_on_complete:
try:
api.purge_download_result()
logger.info("Purged all completed/failed downloads from aria2c history.")
except Exception as e:
logger.warning(f"Failed to purge download history: {e}")
elif args.remove_on_complete:
try:
# The `downloads` variable from the last status check should be valid here.
api.remove_download_result(downloads)
logger.info(f"Removed {len(downloads)} fragment downloads from aria2c history.")
except aria2p.ClientException as e:
logger.warning(f"Could not remove fragment downloads from history (maybe already gone?): {e}")
except Exception as e:
logger.warning(f"Failed to remove fragment downloads from history: {e}")
return 0
else:
print(f"Successfully added {len(gids)} fragments. GIDs: {gids}")
print("These fragments will need to be merged manually after download.")
return 0

297
ytops_client/download_native_py_tool.py Normal file
View File

@ -0,0 +1,297 @@
#!/usr/bin/env python3
"""
Tool to download a specified format using yt-dlp as a Python library.
"""
import argparse
import contextlib
import io
import json
import logging
import os
import re
import shlex
import sys
import time
from datetime import datetime
try:
import yt_dlp
except ImportError:
print("yt-dlp is not installed. Please install it with: pip install yt-dlp", file=sys.stderr)
sys.exit(1)
logger = logging.getLogger('download_native_py_tool')
# A custom logger for yt-dlp to capture output and key events
class YTDLPLogger:
def __init__(self):
self.final_filename = None
self.is_403 = False
self.is_timeout = False
def debug(self, msg):
# yt-dlp logs the destination file path at the debug level.
if msg.startswith('[download] Destination:'):
self.final_filename = msg.split(':', 1)[1].strip()
elif msg.startswith('[download]') and 'has already been downloaded' in msg:
match = re.search(r'\[download\]\s+(.*)\s+has already been downloaded', msg)
if match:
self.final_filename = match.group(1).strip()
logger.debug(msg)
def info(self, msg):
logger.info(msg)
def warning(self, msg):
logger.warning(msg)
def error(self, msg):
if "HTTP Error 403" in msg:
self.is_403 = True
if "Read timed out" in msg:
self.is_timeout = True
logger.error(msg)
def ytdlp_progress_hook(d, ytdlp_logger):
"""Progress hook to capture the final filename."""
if d['status'] == 'finished':
ytdlp_logger.final_filename = d.get('filename')
logger.info(f"Download finished. Final file: {ytdlp_logger.final_filename}")
def add_download_native_py_parser(subparsers):
"""Add the parser for the 'download py' command."""
parser = subparsers.add_parser(
'py',
description='Download using yt-dlp as a Python library (recommended). This method calls yt-dlp functions directly.',
formatter_class=argparse.RawTextHelpFormatter,
help='Download using a direct Python call to yt-dlp (recommended).'
)
parser.add_argument('--load-info-json', type=argparse.FileType('r', encoding='utf-8'), help="Path to the info.json file. If not provided, reads from stdin.")
parser.add_argument('-f', '--format', required=True, help='The format selection string to download (e.g., "18", "299/137", "bestvideo+bestaudio").')
parser.add_argument('--output-dir', default='.', help='Directory to save the downloaded file. Defaults to current directory.')
parser.add_argument('--save-info-json-dir', help='If specified, save the info.json received from stdin to this directory with an auto-generated name.')
parser.add_argument('--proxy', help='Proxy to use for the download, e.g., "socks5://127.0.0.1:1080".')
parser.add_argument('--proxy-rename', help='Apply sed-style regex substitution to the proxy URL. Format: s/pattern/replacement/')
parser.add_argument('--temp-path', help='Directory for temporary files (e.g., fragments). Use a RAM disk for best performance.')
parser.add_argument('--pause', type=int, default=0, help='Seconds to wait before starting the download.')
parser.add_argument('--download-continue', action='store_true', help='Enable download continuation (--no-overwrites and --continue flags for yt-dlp).')
parser.add_argument('--verbose', action='store_true', help='Enable verbose output for this script and yt-dlp.')
parser.add_argument('--cli-config', help='Path to a yt-dlp configuration file to load.')
parser.add_argument('--downloader', help='Name of the external downloader backend for yt-dlp to use (e.g., "aria2c", "native").')
parser.add_argument('--downloader-args', help='Arguments to pass to the external downloader backend (e.g., "aria2c:-x 8").')
parser.add_argument('--extra-ytdlp-args', help='A string of extra command-line arguments to pass to yt-dlp.')
parser.add_argument('--output-buffer', action='store_true', help='Download to an in-memory buffer and print raw bytes to stdout. Final filename is printed to stderr.')
parser.add_argument('--cleanup', action='store_true', help='After download, rename the file to include a timestamp and truncate it to 0 bytes.')
parser.add_argument('--merge-output-format', help='Container format to merge to (e.g., "mp4", "mkv"). Overrides config file.')
return parser
def main_download_native_py(args):
"""Main logic for the 'download-native-py' command."""
# If outputting to buffer, all logging must go to stderr to keep stdout clean for binary data.
log_stream = sys.stderr if args.output_buffer else sys.stdout
log_level = logging.DEBUG if args.verbose else logging.INFO
# Reconfigure root logger
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
logging.basicConfig(level=log_level, stream=log_stream, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
if args.pause > 0:
logger.info(f"Pausing for {args.pause} seconds...")
time.sleep(args.pause)
info_json_content = ""
input_source_name = ""
if args.load_info_json:
info_json_content = args.load_info_json.read()
input_source_name = args.load_info_json.name
else:
info_json_content = sys.stdin.read()
input_source_name = "stdin"
if not info_json_content.strip():
logger.error(f"Failed to read info.json from {input_source_name}. Input is empty.")
return 1
try:
info_data = json.loads(info_json_content)
logger.info(f"Successfully loaded info.json from {input_source_name}.")
except json.JSONDecodeError:
logger.error(f"Failed to parse info.json from {input_source_name}. Is the input valid JSON?")
return 1
if args.save_info_json_dir:
try:
video_id = info_data.get('id', 'unknown_video_id')
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"{timestamp}-{video_id}-info.json"
output_path = os.path.join(args.save_info_json_dir, filename)
os.makedirs(args.save_info_json_dir, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(info_data, f, indent=2)
logger.info(f"Saved info.json to {output_path}")
except Exception as e:
logger.error(f"Failed to save info.json: {e}")
# Handle proxy and proxy rename
proxy_url = args.proxy
if not proxy_url:
proxy_url = info_data.get('_proxy_url')
if proxy_url:
logger.info(f"Using proxy from info.json: {proxy_url}")
if proxy_url and args.proxy_rename:
rename_rule = args.proxy_rename.strip("'\"")
if rename_rule.startswith('s/') and rename_rule.count('/') >= 2:
try:
parts = rename_rule.split('/')
pattern, replacement = parts[1], parts[2]
original_proxy = proxy_url
proxy_url = re.sub(pattern, replacement, proxy_url)
logger.info(f"Renamed proxy URL from '{original_proxy}' to '{proxy_url}' using rule '{rename_rule}'")
except re.error as e:
logger.error(f"Invalid regex in --proxy-rename: {e}")
return 1
else:
logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/")
return 1
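# Example (illustrative values): --proxy-rename 's/localhost/172.17.0.1/' turns
# 'socks5://localhost:1080' into 'socks5://172.17.0.1:1080'.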
# Build the yt-dlp options dictionary
# Start by parsing options from config file and extra args to establish a baseline.
base_opts_args = []
if args.cli_config and os.path.exists(args.cli_config):
try:
with open(args.cli_config, 'r', encoding='utf-8') as f:
config_content = f.read()
base_opts_args.extend(shlex.split(config_content))
logger.info(f"Loaded {len(base_opts_args)} arguments from config file: {args.cli_config}")
except Exception as e:
logger.error(f"Failed to read or parse config file {args.cli_config}: {e}")
return 1
elif args.cli_config:
logger.warning(f"Config file '{args.cli_config}' not found. Ignoring.")
if args.extra_ytdlp_args:
extra_args_list = shlex.split(args.extra_ytdlp_args)
logger.info(f"Adding {len(extra_args_list)} extra arguments from --extra-ytdlp-args.")
base_opts_args.extend(extra_args_list)
ydl_opts = {}
if base_opts_args:
try:
# This is an internal API (it returns a ParsedOptions namedtuple of
# (parser, options, urls, ydl_opts)), but it's the most accurate way to parse
# CLI args into the ydl_opts dictionary format.
*_, ydl_opts = yt_dlp.parse_options(base_opts_args)
except Exception as e:
logger.error(f"Failed to parse options from config/extra_args: {e}")
return 1
# Now, layer the script's explicit arguments on top, as they have higher precedence.
os.makedirs(args.output_dir, exist_ok=True)
output_template = os.path.join(args.output_dir, '%(title)s [%(id)s].f%(format_id)s.%(ext)s')
ytdlp_logger = YTDLPLogger()
# Use update to merge, so explicit args overwrite config/extra args.
ydl_opts.update({
'format': args.format,
'outtmpl': '-' if args.output_buffer else output_template,
'logger': ytdlp_logger,
'progress_hooks': [lambda d: ytdlp_progress_hook(d, ytdlp_logger)],
'verbose': args.verbose,
})
if args.temp_path:
ydl_opts['paths'] = {'temp': args.temp_path}
logger.info(f"Using temporary path: {args.temp_path}")
if args.download_continue:
ydl_opts['continuedl'] = True
ydl_opts['nooverwrites'] = True
if proxy_url:
ydl_opts['proxy'] = proxy_url
if args.downloader:
# YoutubeDL takes the external downloader as a dict of protocol -> downloader name.
ydl_opts['external_downloader'] = {'default': args.downloader}
if args.downloader_args:
# yt-dlp expects 'external_downloader_args' as a dict of downloader name -> list of args,
# e.g., {'aria2c': ['-x', '8']}
try:
downloader_name, args_str = args.downloader_args.split(':', 1)
ydl_opts.setdefault('external_downloader_args', {})[downloader_name] = shlex.split(args_str)
except ValueError:
logger.error(f"Invalid --downloader-args format. Expected 'downloader:args'. Got: '{args.downloader_args}'")
return 1
if args.merge_output_format:
ydl_opts['merge_output_format'] = args.merge_output_format
try:
logger.info(f"Starting download for format '{args.format}' using yt-dlp library...")
download_buffer = None
if args.output_buffer:
# When downloading to buffer, we redirect stdout to capture the binary data.
download_buffer = io.BytesIO()
ctx_mgr = contextlib.redirect_stdout(download_buffer)
else:
# Otherwise, use a null context manager.
ctx_mgr = contextlib.nullcontext()
with ctx_mgr, yt_dlp.YoutubeDL(ydl_opts) as ydl:
# The download() method is for URLs. For a pre-fetched info dict,
# we must use process_ie_result to bypass the info extraction step.
# It raises DownloadError on failure, which is caught by the outer try...except block.
ydl.process_ie_result(info_data)
# If process_ie_result completes without an exception, the download was successful.
retcode = 0
# The success path is now always taken if no exception was raised.
if retcode == 0:
logger.info("yt-dlp download completed successfully.")
if args.output_buffer:
# Write the captured binary data to the actual stdout.
sys.stdout.buffer.write(download_buffer.getvalue())
sys.stdout.buffer.flush()
# Print the filename to stderr for the orchestrator.
if ytdlp_logger.final_filename:
print(ytdlp_logger.final_filename, file=sys.stderr)
else:
# Print the filename to stdout as usual.
if ytdlp_logger.final_filename:
print(ytdlp_logger.final_filename, file=sys.stdout)
if args.cleanup:
downloaded_filepath = ytdlp_logger.final_filename
if downloaded_filepath and os.path.exists(downloaded_filepath):
try:
logger.info(f"Cleanup: Renaming and truncating '{downloaded_filepath}'")
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
directory, original_filename = os.path.split(downloaded_filepath)
filename_base, filename_ext = os.path.splitext(original_filename)
new_filename = f"{filename_base}_{timestamp}{filename_ext}.empty"
new_filepath = os.path.join(directory, new_filename)
os.rename(downloaded_filepath, new_filepath)
logger.info(f"Renamed to '{new_filepath}'")
with open(new_filepath, 'w') as f:
pass
logger.info(f"Truncated '{new_filepath}' to 0 bytes.")
except Exception as e:
logger.error(f"Cleanup failed: {e}")
return 1 # Treat cleanup failure as a script failure
elif not args.output_buffer:
logger.warning("Cleanup requested, but no downloaded file was found. Skipping cleanup.")
return 0
else:
logger.error(f"yt-dlp download failed with internal exit code {retcode}.")
return 1
except yt_dlp.utils.DownloadError as e:
# This catches download-specific errors from yt-dlp
logger.error(f"yt-dlp DownloadError: {e}")
return 1
except Exception as e:
logger.exception(f"An unexpected error occurred during yt-dlp execution: {e}")
return 1

285
ytops_client/download_tool.py Normal file
View File

@ -0,0 +1,285 @@
#!/usr/bin/env python3
"""
Tool to download a specified format using an info.json from stdin.
"""
import argparse
import json
import logging
import os
import re
import shlex
import subprocess
import sys
import tempfile
import time
from datetime import datetime
# Configure logging
logger = logging.getLogger('download_tool')
def add_download_parser(subparsers):
"""Add the parser for the 'download cli' command."""
parser = subparsers.add_parser(
'cli',
description='Download using the legacy yt-dlp CLI wrapper. This method invokes yt-dlp as a subprocess.',
formatter_class=argparse.RawTextHelpFormatter,
help='Download using the legacy yt-dlp CLI wrapper.'
)
parser.add_argument('--load-info-json', type=argparse.FileType('r', encoding='utf-8'), help="Path to the info.json file. If not provided, reads from stdin.")
parser.add_argument('-f', '--format', required=True, help='The format selection string to download (e.g., "18", "299/137", "bestvideo+bestaudio").')
parser.add_argument('--output-dir', default='.', help='Directory to save the downloaded file. Defaults to current directory.')
parser.add_argument('--save-info-json-dir', help='If specified, save the info.json received from stdin to this directory with an auto-generated name.')
parser.add_argument('--proxy', help='Proxy to use for the download, e.g., "socks5://127.0.0.1:1080". This option sets the proxy, overriding any value from the info.json.')
parser.add_argument('--proxy-rename', help='Apply sed-style regex substitution to the proxy URL. Format: s/pattern/replacement/')
parser.add_argument('--pause', type=int, default=0, help='Seconds to wait before starting the download.')
parser.add_argument('--print-traffic', action='store_true', help='Print traffic instead of a progress bar.')
parser.add_argument('--download-continue', action='store_true', help='Enable download continuation (--continue and --part flags for yt-dlp).')
parser.add_argument('--verbose', action='store_true', help='Enable verbose output for this script and yt-dlp.')
parser.add_argument('--cli-config', default='cli.config', help='Path to a yt-dlp configuration file. Defaults to "cli.config".')
parser.add_argument('--cleanup', action='store_true', help='After download, rename the file to include a timestamp and truncate it to 0 bytes.')
parser.add_argument('--log-file', help='Append full yt-dlp output to the specified log file.')
parser.add_argument('--yt-dlp-path', default='yt-dlp', help='Path to the yt-dlp executable. Defaults to "yt-dlp" in PATH.')
parser.add_argument('--extra-ytdlp-args', help='A string of extra command-line arguments to pass to yt-dlp.')
parser.add_argument('--downloader', help='Name of the external downloader to use (e.g., "aria2c", "native").')
parser.add_argument('--downloader-args', help='Arguments to pass to the external downloader (e.g., "aria2c:-x 8").')
parser.add_argument('--merge-output-format', help='Container format to merge to (e.g., "mp4", "mkv"). Overrides config file.')
return parser
def main_download(args):
"""Main logic for the 'download' command."""
if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)
if args.pause > 0:
logger.info(f"Pausing for {args.pause} seconds...")
time.sleep(args.pause)
info_json_content = ""
input_source_name = ""
if args.load_info_json:
info_json_content = args.load_info_json.read()
input_source_name = args.load_info_json.name
else:
info_json_content = sys.stdin.read()
input_source_name = "stdin"
if not info_json_content.strip():
logger.error(f"Failed to read info.json from {input_source_name}. Input is empty.")
return 1
try:
info_data = json.loads(info_json_content)
logger.info(f"Successfully loaded info.json from {input_source_name}.")
except json.JSONDecodeError:
logger.error(f"Failed to parse info.json from {input_source_name}. Is the input valid JSON?")
return 1
if args.save_info_json_dir:
try:
video_id = info_data.get('id', 'unknown_video_id')
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"{timestamp}-{video_id}-info.json"
output_path = os.path.join(args.save_info_json_dir, filename)
os.makedirs(args.save_info_json_dir, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(info_data, f, indent=2)
logger.info(f"Saved info.json to {output_path}")
except Exception as e:
logger.error(f"Failed to save info.json: {e}")
# Determine proxy to use
proxy_url = args.proxy
if not proxy_url:
proxy_url = info_data.get('_proxy_url')
if proxy_url:
logger.info(f"Using proxy from info.json: {proxy_url}")
if proxy_url and args.proxy_rename:
rename_rule = args.proxy_rename
# The user's command line might include quotes that are preserved by shlex.
# Strip them to get the raw rule.
rename_rule = rename_rule.strip("'\"")
if rename_rule.startswith('s/') and rename_rule.count('/') >= 2:
try:
parts = rename_rule.split('/')
pattern = parts[1]
replacement = parts[2]
original_proxy = proxy_url
proxy_url = re.sub(pattern, replacement, proxy_url)
logger.info(f"Renamed proxy URL from '{original_proxy}' to '{proxy_url}' using rule '{rename_rule}'")
except re.error as e:
logger.error(f"Invalid regex in --proxy-rename: {e}")
return 1
except IndexError:
logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/")
return 1
else:
logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/")
return 1
# yt-dlp needs to load the info.json from a file
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', encoding='utf-8') as tmp:
json.dump(info_data, tmp)
info_json_path = tmp.name
logger.debug(f"Temporarily saved info.json to {info_json_path}")
downloaded_filepath = None
return_code = 1 # Default to error
try:
# Create output directory if it doesn't exist
os.makedirs(args.output_dir, exist_ok=True)
output_template = os.path.join(args.output_dir, '%(title)s [%(id)s].f%(format_id)s.%(ext)s')
cmd = [
args.yt_dlp_path,
'--load-info-json', info_json_path,
'-f', args.format,
'-o', output_template,
'--print', 'filename',
]
if args.extra_ytdlp_args:
cmd.extend(shlex.split(args.extra_ytdlp_args))
if args.downloader:
cmd.extend(['--downloader', args.downloader])
if args.downloader_args:
cmd.extend(['--downloader-args', args.downloader_args])
if args.merge_output_format:
cmd.extend(['--merge-output-format', args.merge_output_format])
if args.download_continue:
cmd.extend(['--continue', '--part'])
if os.path.exists(args.cli_config):
logger.info(f"Using config file: {args.cli_config}")
cmd.extend(['--config-location', args.cli_config])
else:
logger.info(f"Config file '{args.cli_config}' not found. Using yt-dlp defaults.")
if args.print_traffic:
cmd.append('--print-traffic')
cmd.append('--no-progress')
else:
cmd.append('--progress')
if args.verbose:
cmd.append('--verbose')
if proxy_url:
cmd.extend(['--proxy', proxy_url])
# Determine if we need to capture output.
capture_output = args.cleanup or args.log_file or args.print_traffic
if capture_output and not args.print_traffic:
logger.info("Note: --cleanup or --log-file requires capturing output, which may affect progress bar display.")
logger.info(f"Executing yt-dlp command for format '{args.format}'")
# Construct a display version of the command for logging
display_cmd_str = ' '.join(f"'{arg}'" if ' ' in arg else arg for arg in cmd)
if os.path.exists(args.cli_config):
try:
with open(args.cli_config, 'r', encoding='utf-8') as f:
config_contents = ' '.join(f.read().split())
if config_contents:
logger.info(f"cli.config contents: {config_contents}")
except IOError as e:
logger.warning(f"Could not read config file {args.cli_config}: {e}")
logger.info(f"Full command: {display_cmd_str}")
if capture_output:
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding='utf-8')
log_f = None
if args.log_file:
try:
log_f = open(args.log_file, 'a', encoding='utf-8')
log_f.write(f"\n--- Log entry: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ---\n")
log_f.write(f"Command: {' '.join(cmd)}\n\n")
except IOError as e:
logger.error(f"Failed to open log file {args.log_file}: {e}")
stdout_data, stderr_data = process.communicate()
return_code = process.returncode
# Write captured output to terminal and log file
if stdout_data:
sys.stdout.write(stdout_data)
sys.stdout.flush()
if log_f:
for line in stdout_data.splitlines(True):
log_f.write(f"[stdout] {line}")
if stderr_data:
sys.stderr.write(stderr_data)
sys.stderr.flush()
if log_f:
for line in stderr_data.splitlines(True):
log_f.write(f"[stderr] {line}")
stdout_lines = stdout_data.splitlines() if stdout_data else []
if log_f:
log_f.write(f"\n--- End log entry (yt-dlp exit code: {return_code}) ---\n")
log_f.close()
for line in reversed(stdout_lines):
if line and os.path.exists(line):
downloaded_filepath = line
logger.info(f"Detected downloaded file: {downloaded_filepath}")
break
else:
# Original behavior: progress bar direct to terminal, no capture
process = subprocess.Popen(cmd)
process.wait()
return_code = process.returncode
if return_code != 0:
logger.error(f"yt-dlp exited with error code {return_code}")
else:
logger.info("yt-dlp command completed successfully.")
except Exception as e:
logger.exception(f"An unexpected error occurred: {e}")
return 1
finally:
# Clean up the temporary file
if os.path.exists(info_json_path):
os.unlink(info_json_path)
logger.debug(f"Removed temporary file {info_json_path}")
# Cleanup phase
if args.cleanup:
if downloaded_filepath and os.path.exists(downloaded_filepath):
try:
logger.info(f"Cleanup: Renaming and truncating '{downloaded_filepath}'")
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
directory, original_filename = os.path.split(downloaded_filepath)
filename_base, filename_ext = os.path.splitext(original_filename)
# New name format is [base]_[timestamp][ext].empty
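# e.g. (illustrative filename): 'clip [dQw4w9WgXcQ].f18.mp4' would become
# 'clip [dQw4w9WgXcQ].f18_20250101_120000.mp4.empty', then be truncated to 0 bytes.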
new_filename = f"{filename_base}_{timestamp}{filename_ext}.empty"
new_filepath = os.path.join(directory, new_filename)
os.rename(downloaded_filepath, new_filepath)
logger.info(f"Renamed to '{new_filepath}'")
with open(new_filepath, 'w') as f:
pass
logger.info(f"Truncated '{new_filepath}' to 0 bytes.")
except Exception as e:
logger.error(f"Cleanup failed: {e}")
return 1
else:
logger.warning("Cleanup requested, but no downloaded file was found. Skipping cleanup.")
return return_code

View File

@ -0,0 +1,473 @@
#!/usr/bin/env python3
"""
Tool to get info.json from the Thrift service.
"""
import argparse
import json
import os
import re
import sys
import logging
import codecs
from datetime import datetime
from typing import Dict, Any, Optional
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Note: The CLI entrypoint will configure the root logger.
# We get our own logger here for namespacing.
logger = logging.getLogger('get_info_tool')
# Import Thrift modules
# Add project's thrift gen_py path to allow importing 'pangramia'
script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(script_dir, '..'))
sys.path.insert(0, os.path.join(project_root, 'thrift_model', 'gen_py'))
from thrift.transport import TTransport
from pangramia.yt.common.ttypes import TokenUpdateMode
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
from yt_ops_services.client_utils import get_thrift_client
from ytops_client.request_params_help import REQUEST_PARAMS_HELP_STRING
def get_video_id(url: str) -> str:
"""Extracts a YouTube video ID from a URL."""
# For URLs like https://www.youtube.com/watch?v=VIDEO_ID
match = re.search(r"v=([0-9A-Za-z_-]{11})", url)
if match:
return match.group(1)
# For URLs like https://youtu.be/VIDEO_ID
match = re.search(r"youtu\.be\/([0-9A-Za-z_-]{11})", url)
if match:
return match.group(1)
# For plain video IDs
if re.fullmatch(r'[0-9A-Za-z_-]{11}', url):
return url
return "unknown_video_id"
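# Illustrative inputs/outputs (video ID assumed):
#   get_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") -> "dQw4w9WgXcQ"
#   get_video_id("https://youtu.be/dQw4w9WgXcQ")                -> "dQw4w9WgXcQ"
#   get_video_id("dQw4w9WgXcQ")                                 -> "dQw4w9WgXcQ"
#   get_video_id("https://example.com/page")                    -> "unknown_video_id"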
def parse_key_value_params(params_str: str) -> Dict[str, Any]:
"""Parses a comma-separated string of key=value pairs into a nested dict."""
params = {}
if not params_str:
return params
for pair in params_str.split(','):
if '=' not in pair:
logger.warning(f"Skipping malformed parameter pair: {pair}")
continue
key, value_str = pair.split('=', 1)
keys = key.strip().split('.')
# Try to parse value as JSON primitive, otherwise treat as string
try:
# Don't parse if it's quoted, treat as string
if (value_str.startswith('"') and value_str.endswith('"')) or \
(value_str.startswith("'") and value_str.endswith("'")):
value = value_str[1:-1]
else:
value = json.loads(value_str)
except json.JSONDecodeError:
value = value_str
d = params
for k in keys[:-1]:
if k not in d or not isinstance(d[k], dict):
d[k] = {}
d = d[k]
d[keys[-1]] = value
return params
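# Illustrative example (keys assumed): parse_key_value_params(
#     "caching_policy.mode=force_refresh,ytdlp_params.verbose=true")
# -> {"caching_policy": {"mode": "force_refresh"}, "ytdlp_params": {"verbose": True}}
# Unquoted values are parsed as JSON primitives where possible (true -> True);
# because pairs are split on ',', values containing commas are not supported here.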
def add_get_info_parser(subparsers):
"""Add the parser for the 'get-info' command."""
parser = subparsers.add_parser(
'get-info',
description='Get info.json from Thrift service',
formatter_class=argparse.RawTextHelpFormatter,
help='Get info.json from the Thrift service.'
)
parser.add_argument('url', help='YouTube URL or video ID')
parser.add_argument('--host', default='127.0.0.1', help="Thrift server host. Using 127.0.0.1 avoids harmless connection errors when the local Envoy proxy only listens on IPv4.")
parser.add_argument('--port', type=int, default=9080, help='Thrift server port')
parser.add_argument('--auth-host', help='Thrift server host (overrides --host).')
parser.add_argument('--auth-port', type=int, help='Thrift server port (overrides --port).')
parser.add_argument('--profile', default='default_profile', help='The profile name (accountId) to use for the request.')
parser.add_argument('--client', help='''Specific client to use. Overrides server default.
Available clients:
web, web_safari, web_embedded, web_music, web_creator, mweb
android, android_music, android_creator, android_vr
ios, ios_music, ios_creator
tv, tv_simply, tv_embedded
Append "_camoufox" to any client name (e.g., "web_camoufox") to force
the browser-based generation strategy.''')
parser.add_argument('--output', help='Output file path for the info.json. If not provided, prints to stdout.')
parser.add_argument('--output-auto', action='store_true', help='Automatically generate output filename for info.json and invocation data. Format: DATETIME-CLIENT-VIDEOID-info.json')
parser.add_argument('--output-auto-url-only', action='store_true', help='Automatically generate output filename for info.json (format: VIDEOID-info.json) and also save a copy to latest-info.json.')
parser.add_argument('--output-auto-suffix', help='Suffix to add to the filename before "-info.json" when using --output-auto or --output-auto-url-only. E.g., "-cycle1".')
parser.add_argument('--log-file-auto', action='store_true', help='Automatically generate a log filename and save all script logs to it. Format: VIDEOID-DATETIME.log')
parser.add_argument('--machine-id', help='Identifier for the client machine. Defaults to hostname.')
parser.add_argument('--worker-id', help='Identifier for a worker process. Used for naming files with --save-latest.')
parser.add_argument('--save-latest', action='store_true', help='Save a copy of the info.json to latest-info.json or [worker-id]-latest-info.json. This is implied by --output-auto-url-only.')
parser.add_argument('--assigned-proxy-url', help='A specific proxy URL to use for the request, overriding the server\'s proxy pool logic.')
parser.add_argument('--proxy-rename', help='Apply sed-style regex substitution to the assigned proxy URL. Format: s/pattern/replacement/')
parser.add_argument('--print-proxy', action='store_true', help='Print the proxy used for the request to stderr.')
parser.add_argument('--verbose', action='store_true', help='Enable verbose output')
parser.add_argument('--log-return', action='store_true', help='Log the full summary of the thrift response to stderr, including detailed logs.\nThis is a convenience flag that implies --show-prefetch-log, --show-nodejs-log, and --show-ytdlp-log.')
parser.add_argument('--show-prefetch-log', action='store_true', help='Print the curl pre-fetch log from the server response.')
parser.add_argument('--show-nodejs-log', action='store_true', help='Print the Node.js debug log from the server response.')
parser.add_argument('--show-ytdlp-log', action='store_true', help='Print the yt-dlp debug log from the server response.')
parser.add_argument('--direct', action='store_true', help='Use the direct yt-dlp info.json generation method, bypassing Node.js token generation.')
parser.add_argument('--print-info-out', action='store_true', help='Print the final info.json to stdout. By default, output is suppressed unless writing to a file.')
parser.add_argument('--request-params-json', help=REQUEST_PARAMS_HELP_STRING + '\nCan also be a comma-separated string of key=value pairs (e.g., "caching_policy.mode=force_refresh").')
parser.add_argument('--force-renew', help='Comma-separated list of items to force-renew: cookies, visitor_id, po_token, nsig_cache, all.')
return parser
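# Usage sketch (invocation assumed; host/port depend on your deployment):
#   ytops-client get-info dQw4w9WgXcQ --profile stress_user_0 --output-auto-url-only
# writes dQw4w9WgXcQ-info.json and, because --save-latest is implied, a copy to
# latest-info.json plus latest-proxy.txt when the server reports a proxy.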
def main_get_info(args):
"""Main logic for the 'get-info' command."""
exit_code = 0
# Set log level
if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)
if args.log_file_auto:
video_id = get_video_id(args.url)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_filename = f"{video_id}-{timestamp}.log"
# Get root logger to add file handler
root_logger = logging.getLogger()
file_handler = logging.FileHandler(log_filename)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
root_logger.addHandler(file_handler)
logger.info(f"Logging to file: {log_filename}")
transport = None
try:
# Determine host and port, giving precedence to --auth-* args
host = args.auth_host or args.host
port = args.auth_port or args.port
# Create Thrift client
client, transport = get_thrift_client(host, port)
# Get token data, which includes the info.json
if args.direct:
logger.info(f"Requesting info.json for URL '{args.url}' using DIRECT method.")
if args.client:
logger.info(f"Requesting to use specific client(s): {args.client}")
else:
logger.info("No specific client requested, server will let yt-dlp decide.")
token_data = client.getInfoJsonDirect(url=args.url, clients=args.client)
else:
logger.info(f"Requesting info.json for URL '{args.url}' using profile '{args.profile}'")
# Prepare arguments for the Thrift call
machine_id = args.machine_id
if not machine_id:
import socket
machine_id = socket.gethostname()
logger.info(f"No machine ID provided, using hostname: {machine_id}")
request_params = {}
if args.request_params_json:
try:
request_params = json.loads(args.request_params_json)
except json.JSONDecodeError:
logger.info("Could not parse --request-params-json as JSON, trying as key-value string.")
request_params = parse_key_value_params(args.request_params_json)
if args.force_renew:
items_to_renew = [item.strip() for item in args.force_renew.split(',')]
request_params['force_renew'] = items_to_renew
logger.info(f"Requesting force renew for: {items_to_renew}")
if args.verbose:
# Add verbose flag for yt-dlp on the server
ytdlp_params = request_params.setdefault('ytdlp_params', {})
ytdlp_params['verbose'] = True
logger.info("Verbose mode enabled, requesting verbose yt-dlp logs from server.")
thrift_args = {
'accountId': args.profile,
'updateType': TokenUpdateMode.AUTO,
'url': args.url,
'clients': args.client,
'machineId': machine_id,
'airflowLogContext': None,
'requestParamsJson': json.dumps(request_params) if request_params else None,
'assignedProxyUrl': args.assigned_proxy_url
}
# Handle proxy renaming
assigned_proxy = args.assigned_proxy_url
if assigned_proxy and args.proxy_rename:
rename_rule = args.proxy_rename.strip("'\"")
if rename_rule.startswith('s/') and rename_rule.count('/') >= 2:
try:
parts = rename_rule.split('/')
pattern = parts[1]
replacement = parts[2]
original_proxy = assigned_proxy
assigned_proxy = re.sub(pattern, replacement, assigned_proxy)
logger.info(f"Renamed proxy URL from '{original_proxy}' to '{assigned_proxy}' using rule '{rename_rule}'")
except re.error as e:
logger.error(f"Invalid regex in --proxy-rename: {e}")
return 1
except IndexError:
logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/")
return 1
else:
logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/")
return 1
thrift_args['assignedProxyUrl'] = assigned_proxy
if args.client:
logger.info(f"Requesting to use specific client: {args.client}")
else:
logger.info("No specific client requested, server will use its default.")
token_data = client.getOrRefreshToken(**thrift_args)
if args.print_proxy:
if hasattr(token_data, 'socks') and token_data.socks:
print(f"Proxy used: {token_data.socks}", file=sys.stderr)
else:
print("Proxy information not available in response.", file=sys.stderr)
if not token_data or not hasattr(token_data, 'infoJson') or not token_data.infoJson:
logger.error("Server did not return valid info.json data.")
print("Error: Server did not return valid info.json data.", file=sys.stderr)
return 1
info_json_str = token_data.infoJson
# On success, print summary info to stderr for visibility.
# This provides immediate feedback without interfering with piped stdout.
if hasattr(token_data, 'serverVersionInfo') and token_data.serverVersionInfo:
# Filter out the default params line as requested
filtered_info = '\n'.join(
line for line in token_data.serverVersionInfo.split('\n')
if 'Default yt-dlp CLI params:' not in line
)
print(f"\n--- Server Version Info ---\n{filtered_info}", file=sys.stderr)
if hasattr(token_data, 'requestSummary') and token_data.requestSummary:
try:
summary_data = json.loads(token_data.requestSummary)
print(f"\n--- Request Summary ---\n{summary_data.get('summary', token_data.requestSummary)}", file=sys.stderr)
except json.JSONDecodeError:
# Fallback for old format or non-JSON summary
print(f"\n--- Request Summary ---\n{token_data.requestSummary}", file=sys.stderr)
# Print detailed logs only if explicitly requested
if hasattr(token_data, 'requestSummary') and token_data.requestSummary:
try:
summary_data = json.loads(token_data.requestSummary)
if args.show_prefetch_log or args.log_return:
print("\n--- Prefetch Log ---", file=sys.stderr)
print(summary_data.get('prefetch_log', 'Not available.'), file=sys.stderr)
if args.show_nodejs_log or args.log_return:
print("\n--- Node.js Log ---", file=sys.stderr)
print(summary_data.get('nodejs_log', 'Not available.'), file=sys.stderr)
if args.show_ytdlp_log or args.log_return:
print("\n--- yt-dlp Log ---", file=sys.stderr)
print(summary_data.get('ytdlp_log', 'Not available.'), file=sys.stderr)
except json.JSONDecodeError:
pass # Fallback already handled above
if hasattr(token_data, 'communicationLogPaths') and token_data.communicationLogPaths:
logger.info("--- Communication Log Paths ---")
for log_path in token_data.communicationLogPaths:
logger.info(f" - {log_path}")
# Check if the returned info.json is an error report
try:
info_data = json.loads(info_json_str)
if hasattr(token_data, 'socks') and token_data.socks:
info_data['_proxy_url'] = token_data.socks
if isinstance(info_data, dict) and 'error' in info_data:
error_code = info_data.get('errorCode', 'N/A')
error_message = info_data.get('message', info_data.get('error', 'Unknown error'))
logger.error(f"Server returned an error in info.json (Code: {error_code}): {error_message}")
print(f"Error from server (Code: {error_code}): {error_message}", file=sys.stderr)
# Optionally print the full error JSON
if args.verbose:
print(json.dumps(info_data, indent=2), file=sys.stderr)
exit_code = 1
except json.JSONDecodeError:
logger.error(f"Failed to parse info.json from server: {info_json_str[:200]}...")
print("Error: Failed to parse the info.json response from the server.", file=sys.stderr)
return 1
logger.info(f"Successfully retrieved info.json ({len(info_json_str)} bytes)")
# Save to latest-info.json if requested, or if using --output-auto-url-only for convenience
if args.save_latest or args.output_auto_url_only:
base_latest_filename = f"{args.worker_id}-latest" if args.worker_id else "latest"
latest_info_filename = f"{base_latest_filename}-info.json"
latest_proxy_filename = f"{base_latest_filename}-proxy.txt"
try:
with open(latest_info_filename, 'w', encoding='utf-8') as f:
json.dump(info_data, f, indent=2)
logger.info(f"Wrote info.json to {latest_info_filename}")
print(f"Successfully saved info.json to {latest_info_filename}", file=sys.stderr)
except IOError as e:
logger.error(f"Failed to write to {latest_info_filename}: {e}")
print(f"Error: Failed to write to {latest_info_filename}: {e}", file=sys.stderr)
if hasattr(token_data, 'socks') and token_data.socks:
try:
with open(latest_proxy_filename, 'w', encoding='utf-8') as f:
f.write(token_data.socks + '\n')
logger.info(f"Wrote proxy to {latest_proxy_filename}")
print(f"Successfully saved proxy to {latest_proxy_filename}", file=sys.stderr)
except IOError as e:
logger.error(f"Failed to write to {latest_proxy_filename}: {e}")
print(f"Error: Failed to write to {latest_proxy_filename}: {e}", file=sys.stderr)
# Determine output file path if auto-naming is used
output_file = args.output
if args.output_auto or args.output_auto_url_only:
video_id = get_video_id(args.url)
suffix = args.output_auto_suffix or ""
if args.output_auto:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
client_id = args.client or args.profile
base_filename = f"{timestamp}-{client_id}-{video_id}{suffix}"
output_file = f"{base_filename}-info.json"
# Save invocation data
invocation_filename = f"{base_filename}-invocation.json"
invocation_data = {}
for attr in ['ytdlpCommand', 'socks', 'jobId', 'url', 'requestSummary', 'communicationLogPaths']:
if hasattr(token_data, attr):
value = getattr(token_data, attr)
if value:
invocation_data[attr] = value
if hasattr(token_data, 'cookiesBlob') and token_data.cookiesBlob:
invocation_data['cookiesBlob'] = f"present, {len(token_data.cookiesBlob)} bytes"
else:
invocation_data['cookiesBlob'] = "not present"
try:
with open(invocation_filename, 'w', encoding='utf-8') as f:
json.dump(invocation_data, f, indent=2)
logger.info(f"Wrote invocation data to {invocation_filename}")
except IOError as e:
logger.error(f"Failed to write invocation data to {invocation_filename}: {e}")
else: # args.output_auto_url_only
output_file = f"{video_id}{suffix}-info.json"
# Write to output file if specified
if output_file:
try:
# Ensure the output directory exists before writing the file
output_dir = os.path.dirname(output_file)
if output_dir:
os.makedirs(output_dir, exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
# Pretty-print the JSON to the file
json.dump(info_data, f, indent=2)
logger.info(f"Wrote info.json to {output_file}")
# Print success message to stderr to not interfere with stdout piping
print(f"Successfully saved info.json to {output_file}", file=sys.stderr)
# Note: the latest-info.json copy is handled by the --save-latest logic above,
# so nothing else needs to happen here for --output-auto.
except IOError as e:
logger.error(f"Failed to write to output file {output_file}: {e}")
print(f"Error: Failed to write to output file {output_file}: {e}", file=sys.stderr)
return 1
# Print the JSON to stdout if requested, to allow for piping.
if args.print_info_out:
print(json.dumps(info_data, indent=2))
return exit_code
except (PBServiceException, PBUserException) as e:
# Check for non-fatal age-gate errors. These are expected for certain videos
# and should not cause the entire stress test to fail.
is_age_gate_error = hasattr(e, 'errorCode') and e.errorCode == 'AGE_GATED_SIGN_IN'
if is_age_gate_error:
logger.warning(f"Age-gated content detected for URL '{args.url}'. Treating as a non-fatal warning.")
print(f"Warning: Age-gated content detected for '{args.url}'.", file=sys.stderr)
# To avoid breaking downstream parsers, output a valid JSON error object.
# This allows stress testers to see a 'success' (exit 0) but still know it was an age gate issue.
error_json = {
"error": "Age-gated content",
"errorCode": "AGE_GATE",
"message": "Sign in to confirm your age."
}
print(json.dumps(error_json, indent=2))
# We return success because this is not a system failure.
return 0
# Format message for better readability, ensuring newlines are handled.
message = str(e.message or '')
try:
# Attempt to decode as if it has escaped newlines (e.g., '\\n' -> '\n')
message = codecs.decode(message, 'unicode_escape')
except Exception:
# Fallback for safety, though unicode_escape is robust
message = message.replace('\\n', '\n')
# For known user-facing errors, suppress the full traceback unless verbose is explicitly on.
# The goal is to provide a clean error message for common issues.
user_facing_errors = [
"BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED",
"VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED",
"AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "GEO_RESTRICTED"
]
is_user_facing_error = hasattr(e, 'errorCode') and e.errorCode in user_facing_errors
# Only show full traceback in verbose mode AND if it's NOT a common user-facing error.
show_exc_info = args.verbose and not is_user_facing_error
logger.error(f"A Thrift error occurred: {message}", exc_info=show_exc_info)
print(f"\n--- ERROR ---", file=sys.stderr)
print(f"{message}", file=sys.stderr)
if hasattr(e, 'context') and e.context and (args.verbose or not is_user_facing_error):
print(f"\n--- CONTEXT ---", file=sys.stderr)
# The context is a dict from thrift. Pretty print it, handling newlines in values.
if isinstance(e.context, dict):
# Process each value to un-escape newlines for clean printing
processed_context = {}
for key, value in e.context.items():
try:
processed_context[key] = codecs.decode(str(value), 'unicode_escape')
except Exception:
processed_context[key] = str(value).replace('\\n', '\n')
print(json.dumps(processed_context, indent=2), file=sys.stderr)
else:
# Fallback for non-dict context
print(str(e.context), file=sys.stderr)
print("\n", file=sys.stderr)
return 1
except TTransport.TTransportException as e:
logger.error(f"Connection to server failed: {e}", exc_info=args.verbose)
print(f"Error: Connection to server at {args.host}:{args.port} failed.", file=sys.stderr)
return 1
except Exception as e:
logger.exception(f"An unexpected error occurred: {e}")
print(f"An unexpected error occurred: {e}", file=sys.stderr)
return 1
finally:
if transport and transport.isOpen():
transport.close()
logger.info("Thrift connection closed.")

View File

@ -0,0 +1,228 @@
"""
Tool to list available formats from a yt-dlp info.json file.
"""
import sys
import json
import argparse
import re
from urllib.parse import urlparse, parse_qs
from datetime import datetime, timezone
def format_size(b):
"""Format size in bytes to human-readable string."""
if b is None:
return 'N/A'
if b < 1024:
return f"{b}B"
elif b < 1024**2:
return f"{b/1024:.2f}KiB"
elif b < 1024**3:
return f"{b/1024**2:.2f}MiB"
else:
return f"{b/1024**3:.2f}GiB"
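# Illustrative values: format_size(None) -> 'N/A', format_size(500) -> '500B',
# format_size(1536) -> '1.50KiB', format_size(3 * 1024**2) -> '3.00MiB'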
def list_formats(info_json, requested_formats_str=None, file=sys.stdout):
"""Prints a table of available formats from info.json data."""
formats = info_json.get('formats', [])
if not formats:
print("No formats found in the provided info.json.", file=file)
return
requested_formats = []
requested_order = {}
if requested_formats_str:
# Split by comma or slash, and filter out empty strings
requested_formats = [item for item in re.split(r'[,/]', requested_formats_str) if item]
requested_order = {fmt: i for i, fmt in enumerate(requested_formats)}
def sort_key(f):
fid = f.get('format_id', '')
is_requested = fid in requested_order
if is_requested:
# Sort requested formats by the order they were provided
return (False, requested_order[fid])
else:
# Sort other formats numerically by ID
return (True, int(fid) if fid.isdigit() else 999)
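# e.g. with -f "140,18": formats 140 and 18 are listed first, in that order,
# followed by the remaining formats sorted numerically by format_id.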
sorted_formats = sorted(formats, key=sort_key)
# Check if any requested formats were found
if requested_formats:
found_any = any(f.get('format_id') in requested_order for f in formats)
if not found_any:
print("WARNING: None of the requested formats were found in the info.json.", file=sys.stderr)
# Header
header = "{:<6} {:<7} {:<12} {:<5} {:<18} {:<18} {:<12} {:<10} {:<20} {:<17} {:<15} {:<12} {:<12} {:<12} {:<5} {:<12} {:<12} {:<12} {:<12} {:<12}".format(
"ID", "EXT", "RESOLUTION", "FPS", "VCODEC", "ACODEC", "FILESIZE", "TBR", "URL (path)", "EXPIRE (UTC)", "IP", "ID_TOKEN", "SESS_TOKEN", "EI_TOKEN", "GIR", "BUI_TOKEN", "POT_TOKEN", "MT_TOKEN", "SIG", "LSIG"
)
print(header, file=file)
print("-" * len(header), file=file)
for f in sorted_formats:
format_id = f.get('format_id', 'N/A')
ext = f.get('ext', 'N/A')
resolution = f.get('resolution')
if not resolution:
if 'width' in f and f['width'] is not None:
resolution = f"{f['width']}x{f['height']}"
else:
resolution = 'audio only'
fps = f.get('fps', '')
vcodec = f.get('vcodec', 'none')
acodec = f.get('acodec', 'none')
filesize = f.get('filesize') or f.get('filesize_approx')
tbr = f.get('tbr')
display_id = f"*{format_id}" if format_id in requested_order else format_id
url = f.get('url', '')
partial_url, expire_date, ip, id_token_short, sess_token_short, ei_token_short, gir, bui_token_short, pot_token_short, mt_token_short, sig_short, lsig_short = ('N/A',) * 12
if url:
parsed = urlparse(url)
query_params = parse_qs(parsed.query)
path_and_query = parsed.path
if parsed.query:
path_and_query += '?' + parsed.query
if len(path_and_query) > 18:
partial_url = path_and_query[:8] + '...' + path_and_query[-7:]
else:
partial_url = path_and_query
expire_ts = query_params.get('expire', [None])[0]
if expire_ts:
try:
expire_date = datetime.fromtimestamp(int(expire_ts), timezone.utc).strftime('%m-%d %H:%M:%S')
except (ValueError, TypeError):
expire_date = 'Invalid'
ip = query_params.get('ip', ['N/A'])[0]
id_token = query_params.get('id', [None])[0]
if id_token and len(id_token) > 12:
id_token_short = id_token[:6] + '..' + id_token[-4:]
elif id_token:
id_token_short = id_token
sess_token = query_params.get('n', [None])[0]
if sess_token and len(sess_token) > 12:
sess_token_short = sess_token[:6] + '..' + sess_token[-4:]
elif sess_token:
sess_token_short = sess_token
ei_token = query_params.get('ei', [None])[0]
if ei_token and len(ei_token) > 12:
ei_token_short = ei_token[:6] + '..' + ei_token[-4:]
elif ei_token:
ei_token_short = ei_token
gir = query_params.get('gir', ['N/A'])[0]
bui_token = query_params.get('bui', [None])[0]
if bui_token and len(bui_token) > 12:
bui_token_short = bui_token[:6] + '..' + bui_token[-4:]
elif bui_token:
bui_token_short = bui_token
pot_token = query_params.get('pot', [None])[0]
if pot_token and len(pot_token) > 12:
pot_token_short = pot_token[:6] + '..' + pot_token[-4:]
elif pot_token:
pot_token_short = pot_token
mt_token = query_params.get('mt', [None])[0]
# mt is often just a timestamp, don't shorten unless it's a long hash
if mt_token and len(mt_token) > 12:
mt_token_short = mt_token[:6] + '..' + mt_token[-4:]
elif mt_token:
mt_token_short = mt_token
sig = query_params.get('sig', [None])[0]
if sig and len(sig) > 12:
sig_short = sig[:6] + '..' + sig[-4:]
elif sig:
sig_short = sig
lsig = query_params.get('lsig', [None])[0]
if lsig and len(lsig) > 12:
lsig_short = lsig[:6] + '..' + lsig[-4:]
elif lsig:
lsig_short = lsig
print("{:<6} {:<7} {:<12} {:<5} {:<18} {:<18} {:<12} {:<10} {:<20} {:<17} {:<15} {:<12} {:<12} {:<12} {:<5} {:<12} {:<12} {:<12} {:<12} {:<12}".format(
str(display_id),
str(ext),
str(resolution),
str(fps) if fps else '',
str(vcodec)[:18],
str(acodec)[:18],
format_size(filesize),
f"{tbr:.0f}k" if tbr else 'N/A',
partial_url,
expire_date,
ip,
id_token_short,
sess_token_short,
ei_token_short,
gir,
bui_token_short,
pot_token_short,
mt_token_short,
sig_short,
lsig_short
), file=file)
def add_list_formats_parser(subparsers):
"""Add the parser for the 'list-formats' command."""
parser = subparsers.add_parser(
'list-formats',
description="List available formats from a yt-dlp info.json file.",
formatter_class=argparse.RawTextHelpFormatter,
help="List available formats from a yt-dlp info.json file."
)
parser.add_argument(
'--load-info-json',
type=argparse.FileType('r', encoding='utf-8'),
default=sys.stdin,
help="Path to the info.json file. Reads from stdin if not provided."
)
parser.add_argument(
'-f', '--formats',
help='Comma or slash-separated list of format IDs to highlight and prioritize (e.g., "18,140,299/298").'
)
parser.add_argument(
'-p', '--pass-through',
action='store_true',
help='Pass the input JSON through to stdout, printing the format list to stderr.'
)
return parser
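# Usage sketch (pipeline assumed; adjust subcommand flags to your setup):
#   ytops-client get-info dQw4w9WgXcQ --print-info-out | ytops-client list-formats -f "18,140"
# With --pass-through the table goes to stderr and the JSON is echoed to stdout,
# so the command can sit in the middle of a pipe feeding the download tool.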
def main_list_formats(args):
"""Main logic for the 'list-formats' command."""
try:
# Read the whole content to allow passing it through
info_json_content = args.load_info_json.read()
info_data = json.loads(info_json_content)
# Determine output stream for the format list
output_stream = sys.stderr if args.pass_through else sys.stdout
list_formats(info_data, args.formats, file=output_stream)
# If pass-through is enabled, print the original JSON to stdout
if args.pass_through:
# Use end='' because the read content likely includes a trailing newline
print(info_json_content, end='')
return 0
except json.JSONDecodeError:
print("Error: Invalid JSON provided.", file=sys.stderr)
return 1
except Exception as e:
print(f"An unexpected error occurred: {e}", file=sys.stderr)
return 1

View File

@ -0,0 +1,48 @@
# Using a separate file for this long help message to keep the main script clean.
# It's imported by client tools that use the --request-params-json argument.
REQUEST_PARAMS_HELP_STRING = """JSON string with per-request parameters to override server defaults.
Example of a full configuration JSON showing default values (use single quotes to wrap it):
'{
"_comment": "This JSON object allows overriding server-side defaults for a single request.",
"cookies_file_path": "/path/to/your/cookies.txt",
"context_reuse_policy": {
"enabled": true,
"max_age_seconds": 86400,
"reuse_visitor_id": true,
"reuse_cookies": true
},
"_comment_context_reuse_policy": "Controls how the server reuses session context (cookies, visitor ID) from the account's previous successful request.",
"_comment_reuse_visitor_id": "If true, reuses the visitor ID from the last session to maintain a consistent identity to YouTube. This is automatically disabled for TV clients to avoid bot detection.",
"ytdlp_params": {
"use_curl_prefetch": false,
"skip_cache": false,
"visitor_id_override_enabled": true,
"extractor_args": {
"youtubepot-bgutilhttp": {
"base_url": "http://172.17.0.1:4416"
},
"youtube": {
"pot_trace": "true",
"formats": "duplicate",
"player_js_version": "actual"
},
"youtubepot-webpo": {
"bind_to_visitor_id": "true"
}
}
},
"_comment_ytdlp_params": "Parameters passed directly to the yt-dlp wrapper for info.json generation.",
"_comment_visitor_id_override_enabled": "If true (default), the server validates the visitor ID from the token generator and creates a new one if it is invalid. Set to false to force using the provided visitor ID without validation, which is useful for debugging.",
"_comment_extractor_args": "Directly override yt-dlp extractor arguments. To use BGUtils in script mode, replace 'youtubepot-bgutilhttp' with 'youtubepot-bgutilscript'. The script path is '/opt/bgutil-ytdlp-pot-provider-server/build/generate_once.js'. To disable any explicit provider (like '--bgutils-mode none' on the server), remove both 'youtubepot-bgutilhttp' and 'youtubepot-bgutilscript' keys.",
"session_params": {
"lang": "en-US",
"location": "US",
"deviceCategory": "MOBILE",
"user_agent": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)"
},
"_comment_session_params": "Parameters for the token generation session (primarily for Node.js)."
}'"""
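# Usage sketch (flags assumed to match the get-info tool in this package):
#   ytops-client get-info dQw4w9WgXcQ \
#     --request-params-json '{"ytdlp_params": {"skip_cache": true}}'
# get-info also accepts the key=value shorthand for the same argument:
#   --request-params-json 'caching_policy.mode=force_refresh'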

View File

@ -0,0 +1,788 @@
#!/usr/bin/env python3
"""
Tool to stress-test video format download URLs from an info.json.
"""
import argparse
import collections
import concurrent.futures
import json
import logging
import os
import random
import re
import shlex
import signal
import subprocess
import sys
import threading
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse, parse_qs
# Configure logging
logger = logging.getLogger('stress_formats_tool')
def get_video_id(url: str) -> str:
"""Extracts a YouTube video ID from a URL."""
# For URLs like https://www.youtube.com/watch?v=VIDEO_ID
match = re.search(r"v=([0-9A-Za-z_-]{11})", url)
if match:
return match.group(1)
# For URLs like https://youtu.be/VIDEO_ID
match = re.search(r"youtu\.be\/([0-9A-Za-z_-]{11})", url)
if match:
return match.group(1)
# For plain video IDs
if re.fullmatch(r'[0-9A-Za-z_-]{11}', url):
return url
return "unknown_video_id"
def get_display_name(path_or_url):
"""Returns a clean name for logging, either a filename or a video ID."""
if isinstance(path_or_url, Path):
return path_or_url.name
path_str = str(path_or_url)
video_id = get_video_id(path_str)
if video_id != "unknown_video_id":
return video_id
# Fallback for file paths as strings or weird URLs
return Path(path_str).name
def format_size(b):
"""Format size in bytes to human-readable string."""
if b is None:
return 'N/A'
if b < 1024:
return f"{b}B"
elif b < 1024**2:
return f"{b/1024:.2f}KiB"
elif b < 1024**3:
return f"{b/1024**2:.2f}MiB"
else:
return f"{b/1024**3:.2f}GiB"
class StatsTracker:
"""Tracks and reports statistics for the stress test."""
def __init__(self, stats_file=None):
self.events = []
self.start_time = time.time()
self.lock = threading.Lock()
self.stats_file_path = stats_file
self.stats_file_handle = None
if self.stats_file_path:
try:
self.stats_file_handle = open(self.stats_file_path, 'a', encoding='utf-8')
except IOError as e:
logger.error(f"Could not open stats file {self.stats_file_path}: {e}")
def log_event(self, event_data):
"""Log a download attempt event."""
with self.lock:
event_data['timestamp'] = datetime.now().isoformat()
self.events.append(event_data)
if self.stats_file_handle:
self.stats_file_handle.write(json.dumps(event_data) + '\n')
self.stats_file_handle.flush()
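# Each event becomes one JSON line in the stats file, e.g. (illustrative values):
#   {"type": "download", "path": "my_video.json", "format": "18", "success": true,
#    "error_type": null, "details": "Downloaded: clip.f18.mp4 (3.10MiB)",
#    "downloaded_bytes": 3250585, "timestamp": "2025-01-01T12:00:00.000000"}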
def close(self):
"""Close the stats file."""
if self.stats_file_handle:
self.stats_file_handle.close()
def print_summary(self):
"""Print a summary of the test run."""
with self.lock:
if not self.events:
logger.info("No events were recorded.")
return
duration = time.time() - self.start_time
# Separate events by type
fetch_events = [e for e in self.events if e.get('type') == 'fetch']
download_events = [e for e in self.events if e.get('type') != 'fetch'] # Default to download for old events
logger.info("\n--- Test Summary ---")
logger.info(f"Total duration: {duration:.2f} seconds")
if fetch_events:
total_fetches = len(fetch_events)
successful_fetches = sum(1 for e in fetch_events if e['success'])
failed_fetches = total_fetches - successful_fetches
logger.info("\n--- Fetch Summary ---")
logger.info(f"Total info.json fetch attempts: {total_fetches}")
logger.info(f" - Successful: {successful_fetches}")
logger.info(f" - Failed: {failed_fetches}")
if total_fetches > 0:
success_rate = (successful_fetches / total_fetches) * 100
logger.info(f"Success rate: {success_rate:.2f}%")
if failed_fetches > 0:
error_counts = collections.Counter(e.get('error_type', 'Unknown') for e in fetch_events if not e['success'])
logger.info("Failure breakdown:")
for error_type, count in sorted(error_counts.items()):
logger.info(f" - {error_type}: {count}")
if download_events:
total_attempts = len(download_events)
successes = sum(1 for e in download_events if e['success'])
failures = total_attempts - successes
logger.info("\n--- Download Summary ---")
logger.info(f"Total download attempts: {total_attempts}")
logger.info(f" - Successful: {successes}")
logger.info(f" - Failed: {failures}")
if total_attempts > 0:
success_rate = (successes / total_attempts) * 100
logger.info(f"Success rate: {success_rate:.2f}%")
if duration > 1 and total_attempts > 0:
dpm = (total_attempts / duration) * 60
logger.info(f"Attempt rate: {dpm:.2f} attempts/minute")
# Download volume stats
total_bytes = sum(e.get('downloaded_bytes', 0) for e in download_events if e['success'])
if total_bytes > 0:
logger.info(f"Total data downloaded: {format_size(total_bytes)}")
if duration > 1:
bytes_per_second = total_bytes / duration
gb_per_hour = (bytes_per_second * 3600) / (1024**3)
gb_per_day = gb_per_hour * 24
logger.info(f"Download rate: {gb_per_hour:.3f} GB/hour ({gb_per_day:.3f} GB/day)")
if failures > 0:
error_counts = collections.Counter(e.get('error_type', 'Unknown') for e in download_events if not e['success'])
logger.info("Failure breakdown:")
for error_type, count in sorted(error_counts.items()):
logger.info(f" - {error_type}: {count}")
logger.info("--------------------")
def print_banner(args, info_jsons=None, urls=None):
"""Prints a summary of the test configuration."""
logger.info("--- Stress Test Configuration ---")
if args.urls_file:
if args.fetch_only:
logger.info(f"Mode: Fetch-only. Generating info.json files from URL list.")
else:
logger.info(f"Mode: Full-stack test from URL list.")
logger.info(f"URL file: {args.urls_file} ({len(urls)} URLs)")
logger.info(f"Workers: {args.workers}")
logger.info(f"Info.json command: {args.info_json_gen_cmd}")
if args.info_json_gen_cmd_alt and args.alt_cmd_every_n > 0:
logger.info(f"Alternate command (every {args.alt_cmd_every_n} URLs): {args.info_json_gen_cmd_alt}")
if args.profile_prefix:
if args.profile_pool:
logger.info(f"Profile mode: Pool of {args.profile_pool} (prefix: {args.profile_prefix})")
elif args.profile_per_request:
logger.info(f"Profile mode: New profile per request (prefix: {args.profile_prefix})")
else: # info-json-files
logger.info(f"Mode: Download-only from static info.json files.")
if info_jsons:
logger.info(f"Files: {', '.join(str(p.name) for p in info_jsons.keys())}")
logger.info(f"Workers: {args.workers}")
logger.info(f"Format selection: {args.format}")
logger.info(f"Sleep between cycles: {args.sleep}s")
if args.sleep_formats > 0:
logger.info(f"Sleep between formats: {args.sleep_formats}s")
if args.duration > 0:
logger.info(f"Test duration: {args.duration} minutes")
if args.max_attempts > 0:
logger.info(f"Max cycles: {args.max_attempts}")
logger.info(f"Stop on failure: {args.stop_on_failure}")
if args.stop_on_403:
logger.info(f"Stop on 403 error: True")
if args.stop_on_timeout:
logger.info(f"Stop on timeout: True")
logger.info(f"Stats file: {args.stats_file}")
if args.stats_interval > 0:
logger.info(f"Periodic stats interval: {args.stats_interval}s")
if args.format_download_args:
logger.info(f"Extra download args: {args.format_download_args}")
logger.info("Download volume: Tracking total data downloaded")
logger.info("---------------------------------")
def add_stress_formats_parser(subparsers):
"""Add the parser for the 'stress-formats' command."""
parser = subparsers.add_parser(
'stress-formats',
description="A simple, command-line driven stress-testing tool for basic scenarios.\nAll options are configured via flags. For more complex scenarios and advanced\nfeatures like rate limiting and client rotation, use the 'stress-policy' command.",
formatter_class=argparse.RawTextHelpFormatter,
help='Run simple, flag-driven stress tests.',
epilog="""
Usage examples:
# Test a format from a static info.json every 60 seconds
ytops-client stress-formats --info-json-files my_video.json -f 18 --sleep 60
# Test with multiple info.json files in parallel using 4 workers
ytops-client stress-formats --info-json-files "file1.json,file2.json,file3.json" -f 18 --sleep 60 --workers 4
# Fetch a new info.json for a URL and test a format every 5 minutes
ytops-client stress-formats --urls-file urls.txt --info-json-gen-cmd "bin/ytops-client get-info {url}" -f "18" --sleep 300
# Run the test for exactly 10 cycles, continuing on failure
ytops-client stress-formats --info-json-files my_video.json -f 18 --sleep 10 --max-attempts 10 --no-stop-on-failure
"""
)
source_group = parser.add_mutually_exclusive_group(required=True)
source_group.add_argument('--info-json-files', help='Comma-separated paths to static info.json files to use for testing.')
source_group.add_argument('--urls-file', help='Path to a file with URLs/IDs to test. Can be a text file (one per line) or a JSON array of strings.')
parser.add_argument('-f', '--format', help='The format selection string. Can be a comma-separated list of IDs (e.g., "18,137"), "all", "random:X%%" (e.g., "random:10%%"), or "random_from:ID1,ID2,..." to pick one from a list. Required unless --fetch-only is used.')
parser.add_argument('--sleep', type=int, default=60, help='Seconds to wait between batches of download attempts. Default: 60.')
parser.add_argument('--sleep-formats', type=int, default=0, help='Seconds to wait between format downloads within a single file/cycle. Default: 0.')
parser.add_argument('--max-attempts', type=int, default=0, help='Maximum number of test cycles. 0 means run indefinitely. Default: 0.')
parser.add_argument('--duration', type=int, default=0, help='Total duration to run the test in minutes. 0 means run indefinitely (or until max-attempts is reached). Default: 0.')
parser.add_argument('--stop-on-failure', action='store_true', help='Stop the test immediately after the first download failure.')
parser.add_argument('--no-stop-on-failure', dest='stop_on_failure', action='store_false', help='Continue testing even after a download failure. (Default)')
parser.set_defaults(stop_on_failure=False)
parser.add_argument('--stop-on-403', action='store_true', help='Stop the test immediately after a 403 Forbidden error.')
parser.add_argument('--stop-on-timeout', action='store_true', help='Stop the test immediately after a read timeout error.')
parser.add_argument('--fetch-only', action='store_true', help='When used with --urls-file, only fetch and save info.json files without performing download tests.')
parser.add_argument('--workers', type=int, default=1, help='Number of parallel workers for multi-file mode. Default: 1.')
parser.add_argument('--stats-file', default='stress_test_stats.jsonl', help='File to log statistics for each attempt. Default: stress_test_stats.jsonl')
parser.add_argument('--stats-interval', type=int, default=0, help='Interval in seconds to print stats summary periodically. 0 disables. Default: 0.')
# Arguments for info.json generation
parser.add_argument('--info-json-gen-cmd', help='Command template to generate info.json. Use {url}, {worker_id}, {cycle}, and {profile} as placeholders. Required with --urls-file.')
parser.add_argument('--info-json-gen-cmd-alt', help='Alternate command template for info.json generation.')
parser.add_argument('--alt-cmd-every-n', type=int, default=0, help='Use the alternate command for every N-th URL (e.g., N=3 means URLs 3, 6, 9...). Requires --info-json-gen-cmd-alt.')
# Profile generation options
profile_group = parser.add_argument_group('Profile Generation Options (for --urls-file mode)')
profile_group.add_argument('--profile-prefix', help='Base name for generated profile IDs (e.g., "test_user"). Used with --profile-pool or --profile-per-request.')
profile_group.add_argument('--profile-pool', type=int, metavar='N', help='Use a pool of N profiles. Profile ID will be {prefix}_{worker_id %% N}. Requires --profile-prefix.')
profile_group.add_argument('--profile-per-request', action='store_true', help='Generate a new unique profile ID for each request. Profile ID will be {prefix}_{timestamp}_{worker_id}. Requires --profile-prefix.')
# Arguments to pass to format_download.py
parser.add_argument('--format-download-args', nargs='+', help='Additional arguments to pass to the download tool. E.g., --proxy-rename s/old/new/ --cleanup')
parser.add_argument('--verbose', action='store_true', help='Enable verbose output.')
return parser
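# Template sketch (command assumed; placeholders are substituted per worker):
#   --info-json-gen-cmd "bin/ytops-client get-info {url} --profile {profile} --print-info-out"
# With --profile-prefix test_user --profile-pool 2, URL index 3 in cycle 2 expands to:
#   bin/ytops-client get-info VIDEO_ID --profile test_user_1 --print-info-out
# The generated info.json is read from the command's stdout.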
def run_command(cmd, input_data=None):
"""Runs a command, captures its output, and returns status."""
logger.debug(f"Running command: {' '.join(cmd)}")
try:
process = subprocess.Popen(
cmd,
stdin=subprocess.PIPE if input_data else None,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
encoding='utf-8'
)
stdout, stderr = process.communicate(input=input_data)
return process.returncode, stdout, stderr
except FileNotFoundError:
logger.error(f"Command not found: {cmd[0]}. Make sure it's in your PATH.")
return -1, "", f"Command not found: {cmd[0]}"
except Exception as e:
logger.error(f"An error occurred while running command: {' '.join(cmd)}. Error: {e}")
return -1, "", str(e)
def run_download_worker(info_json_path, info_json_content, format_to_download, args):
"""
Performs a single download attempt. Designed to be run in a worker thread.
"""
# 1. Attempt download
download_cmd = [
sys.executable, '-m', 'ytops_client.cli', 'download',
'-f', format_to_download
]
if args.format_download_args:
# with nargs='+', this is a list.
# If it's one item, it might be a single quoted string of args that needs splitting.
if len(args.format_download_args) == 1:
download_cmd.extend(shlex.split(args.format_download_args[0]))
else:
# multiple items, assume they are already split by shell
download_cmd.extend(args.format_download_args)
display_name = get_display_name(info_json_path)
logger.info(f"[{display_name} @ {format_to_download}] Kicking off download process...")
retcode, stdout, stderr = run_command(download_cmd, input_data=info_json_content)
# 2. Check result
is_403_error = "HTTP Error 403" in stderr
is_timeout_error = "Read timed out" in stderr
result = {
'type': 'download',
'path': str(info_json_path),
'format': format_to_download,
'success': retcode == 0,
'error_type': None,
'details': '',
'downloaded_bytes': 0
}
if retcode == 0:
# Success
downloaded_filepath = ''
# The filename is the last non-empty line of stdout that doesn't look like a progress bar
lines = stdout.splitlines()
for line in reversed(lines):
if line and not line.strip().startswith('['):
downloaded_filepath = line.strip()
break
details_str = "OK"
if downloaded_filepath:
details_str = f"Downloaded: {Path(downloaded_filepath).name}"
# Parse download size from stderr
size_in_bytes = 0
size_match = re.search(r'\[download\]\s+100%\s+of\s+~?([0-9.]+)(B|KiB|MiB|GiB)', stderr)
if size_match:
value = float(size_match.group(1))
unit = size_match.group(2)
multipliers = {"B": 1, "KiB": 1024, "MiB": 1024**2, "GiB": 1024**3}
size_in_bytes = int(value * multipliers.get(unit, 1))
result['downloaded_bytes'] = size_in_bytes
details_str += f" ({size_match.group(1)}{unit})"
result['details'] = details_str
else:
# Failure
# Try to get the most relevant error line
error_lines = [line for line in stderr.strip().split('\n') if 'ERROR:' in line]
if error_lines:
result['details'] = error_lines[-1]
else:
# If no "ERROR:" line, use the last few lines of stderr for context.
last_lines = stderr.strip().split('\n')[-3:] # Get up to last 3 lines
result['details'] = ' | '.join(line.strip() for line in last_lines if line.strip())
if not result['details']:
result['details'] = "Unknown error (stderr was empty)"
if is_403_error:
result['error_type'] = 'HTTP 403'
elif is_timeout_error:
result['error_type'] = 'Timeout'
else:
result['error_type'] = f'Exit Code {retcode}'
return result
def process_info_json_cycle(path, content, args, stats):
"""
Processes one info.json file for one cycle, downloading selected formats sequentially.
Logs events and returns a list of results.
"""
results = []
should_stop_file = False
display_name = get_display_name(path)
# Determine formats to test based on the info.json content
try:
info_data = json.loads(content)
available_formats = info_data.get('formats', [])
if not available_formats:
logger.warning(f"[{display_name}] No formats found in info.json. Skipping.")
return []
available_format_ids = [f['format_id'] for f in available_formats]
formats_to_test = []
format_selection_mode = args.format.lower()
if format_selection_mode == 'all':
formats_to_test = available_format_ids
logger.info(f"[{display_name}] Testing all {len(formats_to_test)} available formats.")
elif format_selection_mode.startswith('random:'):
try:
percent_str = format_selection_mode.split(':')[1].rstrip('%')
percent = float(percent_str)
if not (0 < percent <= 100):
raise ValueError("Percentage must be between 0 and 100.")
count = max(1, int(len(available_format_ids) * (percent / 100.0)))
formats_to_test = random.sample(available_format_ids, k=count)
logger.info(f"[{display_name}] Randomly selected {len(formats_to_test)} formats ({percent}%) from all available to test: {', '.join(formats_to_test)}")
except (ValueError, IndexError) as e:
logger.error(f"[{display_name}] Invalid random format selection '{args.format}': {e}. Skipping.")
return []
elif format_selection_mode.startswith('random_from:'):
try:
choices_str = format_selection_mode.split(':', 1)[1]
if not choices_str:
raise ValueError("No formats provided after 'random_from:'.")
format_choices = [f.strip() for f in choices_str.split(',') if f.strip()]
# Filter the choices to only those available in the current info.json
valid_choices = [f for f in format_choices if f in available_format_ids]
if not valid_choices:
logger.warning(f"[{display_name}] None of the requested formats for random selection ({', '.join(format_choices)}) are available. Skipping.")
return []
formats_to_test = [random.choice(valid_choices)]
logger.info(f"[{display_name}] Randomly selected 1 format from your list to test: {formats_to_test[0]}")
except (ValueError, IndexError) as e:
logger.error(f"[{display_name}] Invalid random_from format selection '{args.format}': {e}. Skipping.")
return []
else:
# Standard comma-separated list
requested_formats = [f.strip() for f in args.format.split(',') if f.strip()]
formats_to_test = []
for req_fmt in requested_formats:
# Check for exact match first
if req_fmt in available_format_ids:
formats_to_test.append(req_fmt)
continue
# If no exact match, check for formats that start with this ID + '-'
# e.g., req_fmt '140' should match '140-0'
prefix_match = f"{req_fmt}-"
first_match = next((af for af in available_format_ids if af.startswith(prefix_match)), None)
if first_match:
logger.info(f"[{display_name}] Requested format '{req_fmt}' not found. Using first available match: '{first_match}'.")
formats_to_test.append(first_match)
else:
# This could be a complex selector like 'bestvideo' or '299/298', so keep it.
if req_fmt not in available_format_ids:
logger.warning(f"[{display_name}] Requested format '{req_fmt}' not found in available formats.")
formats_to_test.append(req_fmt)
except json.JSONDecodeError:
logger.error(f"[{display_name}] Failed to parse info.json. Skipping.")
return []
for i, format_id in enumerate(formats_to_test):
if should_stop_file:
break
# Check if the format URL is expired before attempting to download
format_details = next((f for f in available_formats if f.get('format_id') == format_id), None)
if format_details and 'url' in format_details:
parsed_url = urlparse(format_details['url'])
query_params = parse_qs(parsed_url.query)
expire_ts_str = query_params.get('expire', [None])[0]
if expire_ts_str and expire_ts_str.isdigit():
expire_ts = int(expire_ts_str)
if expire_ts < time.time():
logger.warning(f"[{display_name}] Skipping format '{format_id}' because its URL is expired.")
result = {
'type': 'download', 'path': str(path), 'format': format_id,
'success': True, 'error_type': 'Skipped',
'details': 'Download URL is expired', 'downloaded_bytes': 0
}
stats.log_event(result)
results.append(result)
continue # Move to the next format
result = run_download_worker(path, content, format_id, args)
stats.log_event(result)
results.append(result)
status = "SUCCESS" if result['success'] else f"FAILURE ({result['error_type']})"
logger.info(f"Result for {display_name} (format {format_id}): {status} - {result.get('details', 'OK')}")
if not result['success']:
# This flag stops processing more formats for THIS file in this cycle
# The main loop will decide if all cycles should stop.
if args.stop_on_failure or \
(args.stop_on_403 and result['error_type'] == 'HTTP 403') or \
(args.stop_on_timeout and result['error_type'] == 'Timeout'):
logger.info(f"Stopping further format tests for {display_name} in this cycle due to failure.")
should_stop_file = True
# Sleep between formats if needed
if args.sleep_formats > 0 and i < len(formats_to_test) - 1:
logger.info(f"Sleeping for {args.sleep_formats}s before next format for {display_name}...")
time.sleep(args.sleep_formats)
return results
def main_stress_formats(args):
"""Main logic for the 'stress-formats' command."""
# The --format argument is required unless we are only fetching info.json files.
if not args.fetch_only and not args.format:
logger.error("Error: argument -f/--format is required when not using --fetch-only.")
return 1
if (args.profile_pool or args.profile_per_request) and not args.profile_prefix:
logger.error("--profile-prefix is required when using --profile-pool or --profile-per-request.")
return 1
if args.urls_file and args.fetch_only and not args.info_json_gen_cmd:
logger.error("--info-json-gen-cmd is required when using --urls-file with --fetch-only.")
return 1
if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)
else:
# Make the default logger more concise for test output
for handler in logging.root.handlers:
handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s', datefmt='%H:%M:%S'))
stats = StatsTracker(args.stats_file)
start_time = time.time()
duration_seconds = args.duration * 60 if args.duration > 0 else 0
# --- Load sources ---
info_jsons = {}
urls = []
if args.info_json_files:
info_json_files = [Path(p.strip()) for p in args.info_json_files.split(',')]
for file_path in info_json_files:
if not file_path.is_file():
logger.error(f"Info.json file not found: {file_path}")
continue
try:
with open(file_path, 'r', encoding='utf-8') as f:
info_jsons[file_path] = f.read()
except IOError as e:
logger.error(f"Failed to read {file_path}: {e}")
if not info_jsons:
logger.error("No valid info.json files to process. Exiting.")
return 1
logger.info(f"Loaded {len(info_jsons)} info.json file(s).")
print_banner(args, info_jsons=info_jsons)
elif args.urls_file:
if not args.info_json_gen_cmd:
logger.error("--info-json-gen-cmd is required when using --urls-file.")
return 1
try:
with open(args.urls_file, 'r', encoding='utf-8') as f:
content = f.read()
# Try parsing as JSON array first
try:
data = json.loads(content)
if isinstance(data, list) and all(isinstance(item, str) for item in data):
urls = data
logger.info(f"Loaded {len(urls)} URLs/IDs from JSON array in {args.urls_file}.")
else:
# Valid JSON, but not a list of strings. Treat as error to avoid confusion.
logger.error(f"URL file '{args.urls_file}' is valid JSON but not an array of strings.")
return 1
except json.JSONDecodeError:
# Fallback to line-by-line parsing for plain text files
urls = [line.strip() for line in content.splitlines() if line.strip()]
logger.info(f"Loaded {len(urls)} URLs/IDs from text file {args.urls_file}.")
if not urls:
logger.error(f"URL file '{args.urls_file}' is empty or contains no valid URLs/IDs.")
return 1
except IOError as e:
logger.error(f"Failed to read URL file {args.urls_file}: {e}")
return 1
# Clean up URLs/IDs which might have extra quotes, commas, or brackets from copy-pasting
cleaned_urls = []
for url in urls:
# Strip whitespace, then trailing comma, then surrounding junk, then whitespace again
cleaned_url = url.strip().rstrip(',').strip().strip('\'"[]').strip()
if cleaned_url:
cleaned_urls.append(cleaned_url)
if len(cleaned_urls) != len(urls):
logger.info(f"Cleaned URL list, removed {len(urls) - len(cleaned_urls)} empty or invalid entries.")
urls = cleaned_urls
if not urls:
logger.error("URL list is empty after cleaning. Exiting.")
return 1
print_banner(args, urls=urls)
# --- Main test loop ---
cycles = 0
last_stats_print_time = time.time()
try:
# --- Worker function for URL mode ---
def process_url_task(url, url_index, cycle_num):
"""Worker to generate info.json for a URL and then test formats."""
# 1. Generate profile name if configured
profile_name = None
if args.profile_prefix:
if args.profile_pool:
profile_name = f"{args.profile_prefix}_{url_index % args.profile_pool}"
elif args.profile_per_request:
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
profile_name = f"{args.profile_prefix}_{timestamp}_{url_index}"
# 2. Select and format the generation command
gen_cmd_template = args.info_json_gen_cmd
if args.alt_cmd_every_n > 0 and args.info_json_gen_cmd_alt and (url_index + 1) % args.alt_cmd_every_n == 0:
gen_cmd_template = args.info_json_gen_cmd_alt
logger.info(f"Using alternate command for URL #{url_index + 1}: {url}")
try:
# shlex.split handles quoted arguments in the template
video_id = get_video_id(url)
gen_cmd = []
template_args = shlex.split(gen_cmd_template)
# If the video ID could be mistaken for an option, and it appears to be
# a positional argument, insert '--' to prevent misinterpretation.
if video_id.startswith('-'):
try:
# Heuristic: if {url} is the last token, it's likely positional.
if template_args and template_args[-1] == '{url}':
template_args.insert(-1, '--')
except (ValueError, IndexError):
pass # {url} not found or list is empty.
for arg in template_args:
# Replace placeholders
formatted_arg = arg.replace('{url}', video_id) \
.replace('{worker_id}', str(url_index)) \
.replace('{cycle}', str(cycle_num))
if profile_name:
formatted_arg = formatted_arg.replace('{profile}', profile_name)
gen_cmd.append(formatted_arg)
# Pass verbose flag through if set
if args.verbose and 'get_info_json_client.py' in gen_cmd_template and '--verbose' not in gen_cmd_template:
gen_cmd.append('--verbose')
except Exception as e:
logger.error(f"Failed to format --info-json-gen-cmd: {e}")
stats.log_event({'path': url, 'success': False, 'error_type': 'BadGenCmd', 'details': 'Cmd format error'})
return []
# 3. Run command to get info.json
log_msg = f"[{url}] Generating info.json"
if profile_name:
log_msg += f" with profile '{profile_name}'"
log_msg += "..."
logger.info(log_msg)
retcode, stdout, stderr = run_command(gen_cmd)
if retcode != 0:
error_msg = stderr.strip().split('\n')[-1]
logger.error(f"[{url}] Failed to generate info.json: {error_msg}")
event = {'type': 'fetch', 'path': url, 'success': False, 'error_type': 'GetInfoJsonFail', 'details': error_msg}
stats.log_event(event)
return [] # Return empty list, as no formats were tested
# Handle --fetch-only
if args.fetch_only:
logger.info(f"[{url}] Successfully fetched info.json. Skipping download due to --fetch-only.")
event = {'type': 'fetch', 'path': url, 'success': True, 'details': 'OK'}
stats.log_event(event)
return [] # Return empty list, indicating no downloads to check for failure
# 4. Pass to the format processing function
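# The returned list holds one result dict per tested format (keys include 'success', 'format',
# 'error_type'), which the stop-condition checks in the main loop below rely on.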
return process_info_json_cycle(url, stdout, args, stats)
while True:
if duration_seconds and (time.time() - start_time) > duration_seconds:
logger.info(f"Reached duration limit of {args.duration} minutes. Stopping.")
break
cycles += 1
if args.max_attempts > 0 and cycles > args.max_attempts:
logger.info(f"Reached max cycles ({args.max_attempts}). Stopping.")
break
logger.info(f"--- Cycle #{cycles} ---")
with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
future_to_identifier = {}
if args.info_json_files:
future_to_identifier = {
executor.submit(process_info_json_cycle, path, content, args, stats): path
for path, content in info_jsons.items()
}
elif args.urls_file:
future_to_identifier = {
executor.submit(process_url_task, url, i, cycles): url
for i, url in enumerate(urls)
}
should_stop = False
# Keep the pending futures in a set that shrinks as tasks complete.
futures = set(future_to_identifier.keys())
while futures and not should_stop:
# Wait for the next future to complete
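# concurrent.futures.wait() returns a (done, not_done) pair; reassigning 'futures' keeps only the still-pending tasks.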
done, futures = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED)
for future in done:
identifier = future_to_identifier[future]
identifier_name = get_display_name(identifier)
try:
results = future.result()
# Check if any result from this file triggers a global stop
for result in results:
if not result['success']:
if args.stop_on_failure:
logger.info(f"Failure on {identifier_name} (format {result['format']}). Shutting down all workers due to --stop-on-failure.")
should_stop = True
elif args.stop_on_403 and result['error_type'] == 'HTTP 403':
logger.info(f"403 error on {identifier_name} (format {result['format']}). Shutting down all workers due to --stop-on-403.")
should_stop = True
elif args.stop_on_timeout and result['error_type'] == 'Timeout':
logger.info(f"Timeout on {identifier_name} (format {result['format']}). Shutting down all workers due to --stop-on-timeout.")
should_stop = True
except Exception as exc:
logger.error(f'{identifier_name} generated an exception: {exc}')
stats.log_event({'path': str(identifier), 'success': False, 'error_type': 'Exception', 'details': str(exc)})
if should_stop:
break # Stop processing results from 'done' set
# Check for duration limit after each batch of tasks completes
if duration_seconds and (time.time() - start_time) > duration_seconds:
logger.info(f"Reached duration limit of {args.duration} minutes. Cancelling remaining tasks.")
should_stop = True
# If the loop was exited, cancel any remaining tasks
if should_stop and futures:
logger.info(f"Cancelling {len(futures)} outstanding task(s).")
for future in futures:
future.cancel()
if should_stop:
break
if args.stats_interval > 0 and (time.time() - last_stats_print_time) >= args.stats_interval:
stats.print_summary()
last_stats_print_time = time.time()
if args.max_attempts > 0 and cycles >= args.max_attempts:
break
logger.info(f"Cycle complete. Sleeping for {args.sleep} seconds...")
# Interruptible sleep that respects the total test duration
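# Sleeping in 1-second slices keeps Ctrl+C and the duration check responsive instead of blocking for the full --sleep interval.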
sleep_end_time = time.time() + args.sleep
should_stop_after_sleep = False
while time.time() < sleep_end_time:
if duration_seconds and (time.time() - start_time) >= duration_seconds:
logger.info(f"Reached duration limit of {args.duration} minutes during sleep. Stopping.")
should_stop_after_sleep = True
break
time.sleep(1) # Check every second
if should_stop_after_sleep:
break
except KeyboardInterrupt:
logger.info("\nCtrl+C received, shutting down...")
finally:
stats.print_summary()
stats.close()
return 0 if all(e['success'] for e in stats.events) else 1
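# Illustrative invocation (the script name and exact flag spellings below are assumptions
# inferred from the parsed argument names above, not a definitive CLI reference):
#   python stress_test_tool.py \
#       --urls-file urls.txt \
#       --info-json-gen-cmd "get_info_json_client.py --profile {profile} {url}" \
#       --profile-prefix stress --profile-pool 4 \
#       --workers 4 --sleep 30 --duration 60 --stats-interval 120 --stop-on-403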
