yt-dlp-dags/ytops_client/download_native_py_tool.py

350 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Tool to download a specified format using yt-dlp as a Python library.
"""
import argparse
import contextlib
import io
import json
import logging
import os
import re
import shlex
import sys
import time
from datetime import datetime
try:
import yt_dlp
except ImportError:
print("yt-dlp is not installed. Please install it with: pip install yt-dlp", file=sys.stderr)
sys.exit(1)
logger = logging.getLogger('download_native_py_tool')
# A custom logger for yt-dlp to capture output and key events
class YTDLPLogger:
    """Logger object passed to yt-dlp through the ``logger`` option.

    Every message is forwarded unchanged to the module-level ``logger``.
    While forwarding, the text is inspected and a few facts are recorded:

    * ``final_filename`` -- path taken from a ``[download] Destination: ...``
      or ``[download] ... has already been downloaded`` debug message.
    * ``is_403`` -- becomes True once an error message contains
      ``HTTP Error 403``.
    * ``is_timeout`` -- becomes True once an error message contains
      ``Read timed out``.
    """

    def __init__(self):
        self.final_filename = None
        self.is_403 = False
        self.is_timeout = False

    def debug(self, msg):
        """Forward *msg* at debug level, recording any file path it names."""
        # yt-dlp logs the destination file path at the debug level.
        if msg.startswith('[download] Destination:'):
            # Everything after the first colon is the path.
            _prefix, _colon, path_part = msg.partition(':')
            self.final_filename = path_part.strip()
        elif msg.startswith('[download]') and 'has already been downloaded' in msg:
            found = re.search(r'\[download\]\s+(.*)\s+has already been downloaded', msg)
            if found is not None:
                self.final_filename = found.group(1).strip()
        logger.debug(msg)

    def info(self, msg):
        """Forward *msg* at info level."""
        logger.info(msg)

    def warning(self, msg):
        """Forward *msg* at warning level."""
        logger.warning(msg)

    def error(self, msg):
        """Forward *msg* at error level, flagging 403 and read-timeout errors."""
        markers = (
            ("HTTP Error 403", 'is_403'),
            ("Read timed out", 'is_timeout'),
        )
        for needle, flag_name in markers:
            if needle in msg:
                setattr(self, flag_name, True)
        logger.error(msg)
def ytdlp_progress_hook(d, ytdlp_logger):
    """Store the finished download's filename on *ytdlp_logger*.

    Only progress dicts whose ``status`` is ``'finished'`` are acted on; for
    those, ``d.get('filename')`` is written to ``ytdlp_logger.final_filename``
    and reported at info level. Any other status is ignored.
    """
    if d['status'] != 'finished':
        return
    ytdlp_logger.final_filename = d.get('filename')
    logger.info(f"Download finished. Final file: {ytdlp_logger.final_filename}")
def add_download_native_py_parser(subparsers):
    """Register the ``py`` sub-command on *subparsers* and return its parser.

    The options are declared in a table and added in table order, so the
    generated help lists them in the same sequence as they appear here.
    """
    parser = subparsers.add_parser(
        'py',
        description='Download using yt-dlp as a Python library (recommended). This method calls yt-dlp functions directly.',
        formatter_class=argparse.RawTextHelpFormatter,
        help='Download using a direct Python call to yt-dlp (recommended).'
    )

    # (option strings, keyword arguments for add_argument)
    option_table = [
        (('--load-info-json',), {
            'type': argparse.FileType('r', encoding='utf-8'),
            'help': "Path to the info.json file. If not provided, reads from stdin.",
        }),
        (('-f', '--format'), {
            'required': True,
            'help': 'The format selection string to download (e.g., "18", "299/137", "bestvideo+bestaudio").',
        }),
        (('--output-dir',), {
            'default': '.',
            'help': 'Directory to save the downloaded file. Defaults to current directory.',
        }),
        (('--save-info-json-dir',), {
            'help': 'If specified, save the info.json received from stdin to this directory with an auto-generated name.',
        }),
        (('--proxy',), {
            'help': 'Proxy to use for the download, e.g., "socks5://127.0.0.1:1080".',
        }),
        (('--proxy-rename',), {
            'help': 'Apply sed-style regex substitution to the proxy URL. Format: s/pattern/replacement/',
        }),
        (('--temp-path',), {
            'help': 'Directory for temporary files (e.g., fragments). Use a RAM disk for best performance.',
        }),
        (('--pause',), {
            'type': int,
            'default': 0,
            'help': 'Seconds to wait before starting the download.',
        }),
        (('--download-continue',), {
            'action': 'store_true',
            'help': 'Enable download continuation (--no-overwrites and --continue flags for yt-dlp).',
        }),
        (('--verbose',), {
            'action': 'store_true',
            'help': 'Enable verbose output for this script and yt-dlp.',
        }),
        (('--cli-config',), {
            'help': 'Path to a yt-dlp configuration file to load.',
        }),
        (('--downloader',), {
            'help': 'Name of the external downloader backend for yt-dlp to use (e.g., "aria2c", "native").',
        }),
        (('--downloader-args',), {
            'help': 'Arguments to pass to the external downloader backend (e.g., "aria2c:-x 8").',
        }),
        (('--extra-ytdlp-args',), {
            'help': 'A string of extra command-line arguments to pass to yt-dlp.',
        }),
        (('--output-buffer',), {
            'action': 'store_true',
            'help': 'Download to an in-memory buffer and print raw bytes to stdout. Final filename is printed to stderr.',
        }),
        (('--cleanup',), {
            'action': 'store_true',
            'help': 'After download, rename the file to include a timestamp and truncate it to 0 bytes.',
        }),
        (('--merge-output-format',), {
            'help': 'Container format to merge to (e.g., "mp4", "mkv"). Overrides config file.',
        }),
    ]
    for option_strings, settings in option_table:
        parser.add_argument(*option_strings, **settings)
    return parser
def _coerce_option_value(value):
    """Convert an option value string to a number when it looks like one.

    A trailing K/M/G/T (any case) is treated as a binary byte multiplier and
    yields an int; otherwise int, then float, is tried. Anything that does
    not parse is returned unchanged.
    """
    multipliers = {'K': 1024, 'M': 1024**2, 'G': 1024**3, 'T': 1024**4}
    suffix = value[-1:].upper()
    if suffix in multipliers:
        try:
            return int(float(value[:-1]) * multipliers[suffix])
        except (ValueError, OverflowError):
            # e.g. "webm" ends in M but is not a size.
            return value
    for convert in (int, float):
        try:
            return convert(value)
        except ValueError:
            continue
    return value


def _parse_cli_style_options(tokens):
    """Translate ``--option [value]`` tokens into an options dict.

    ``--foo-bar value`` and ``--foo-bar=value`` become ``{'foo_bar': value}``
    (value coerced by ``_coerce_option_value``); a bare ``--foo`` becomes
    ``{'foo': True}`` and a bare ``--no-foo`` becomes ``{'foo': False}``.
    Tokens that do not start with ``--`` and are not consumed as a value are
    skipped with a warning.

    NOTE(review): the option name is only mechanically converted (plus the
    ``limit_rate`` -> ``ratelimit`` special case); CLI options whose
    YoutubeDL parameter name differs in other ways are not translated here.
    """
    opts = {}
    i = 0
    while i < len(tokens):
        arg = tokens[i]
        if not arg.startswith('--'):
            logger.warning(f"Skipping non-option argument in extra args: {arg}")
            i += 1
            continue

        # Split off an inline "=value" before normalising the name so that
        # dashes inside the value are left alone.
        name, has_inline_value, inline_value = arg.lstrip('-').partition('=')
        key = name.replace('-', '_')

        if has_inline_value:
            value = inline_value
            i += 1
        elif i + 1 < len(tokens) and not tokens[i + 1].startswith('--'):
            value = tokens[i + 1]
            i += 2
        else:
            # Flag without a value; "--no-foo" switches "foo" off.
            negated = key.startswith('no_')
            opts[key[3:] if negated else key] = not negated
            logger.debug(f"Parsed flag: {key} = {not negated}")
            i += 1
            continue

        # Try to convert values to numbers, which yt-dlp expects.
        value = _coerce_option_value(value)
        # Special handling for keys that differ from CLI arg, e.g. --limit-rate -> ratelimit
        if key == 'limit_rate':
            key = 'ratelimit'
        opts[key] = value
        logger.debug(f"Parsed option: {key} = {value}")
    return opts


def _apply_proxy_rename(proxy_url, rename_rule):
    """Apply a sed-style ``s/pattern/replacement/`` rule to *proxy_url*.

    Returns the rewritten URL, or None (after logging an error) when the rule
    is malformed or its regex is invalid.
    """
    rule = rename_rule.strip("'\"")
    if not (rule.startswith('s/') and rule.count('/') >= 2):
        logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/")
        return None
    _marker, pattern, replacement = rule.split('/')[:3]
    try:
        renamed = re.sub(pattern, replacement, proxy_url)
    except re.error as e:
        logger.error(f"Invalid regex in --proxy-rename: {e}")
        return None
    logger.info(f"Renamed proxy URL from '{proxy_url}' to '{renamed}' using rule '{rule}'")
    return renamed


def _rename_and_truncate(downloaded_filepath):
    """Rename the file to ``<base>_<timestamp><ext>.empty`` and empty it.

    Returns True on success, False (after logging an error) on failure.
    """
    try:
        logger.info(f"Cleanup: Renaming and truncating '{downloaded_filepath}'")
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        directory, original_filename = os.path.split(downloaded_filepath)
        filename_base, filename_ext = os.path.splitext(original_filename)
        new_filename = f"{filename_base}_{timestamp}{filename_ext}.empty"
        new_filepath = os.path.join(directory, new_filename)
        os.rename(downloaded_filepath, new_filepath)
        logger.info(f"Renamed to '{new_filepath}'")
        # Opening for writing truncates the file to 0 bytes.
        with open(new_filepath, 'w'):
            pass
        logger.info(f"Truncated '{new_filepath}' to 0 bytes.")
    except (OSError, ValueError) as e:
        logger.error(f"Cleanup failed: {e}")
        return False
    return True


def main_download_native_py(args):
    """Main logic for the 'download-native-py' command.

    Reads an info.json document (from ``args.load_info_json`` or stdin),
    builds a yt-dlp options dict from the config file, extra arguments and
    the explicit command-line flags (explicit flags win), and hands the info
    dict to ``yt_dlp.YoutubeDL.process_ie_result``.

    On success the final filename, when known, is printed to stdout (or to
    stderr with ``--output-buffer``, where stdout carries the raw bytes).

    Args:
        args: Parsed arguments as produced by the parser registered in
            ``add_download_native_py_parser``.

    Returns:
        0 on success, 1 on any handled failure (bad input, bad option
        syntax, download error, cleanup error).
    """
    # If outputting to buffer, all logging must go to stderr to keep stdout clean for binary data.
    log_stream = sys.stderr if args.output_buffer else sys.stdout
    log_level = logging.DEBUG if args.verbose else logging.INFO
    # Reconfigure root logger
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(level=log_level, stream=log_stream, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    if args.pause > 0:
        logger.info(f"Pausing for {args.pause} seconds...")
        time.sleep(args.pause)

    # --- Load and validate the info.json input -------------------------------
    if args.load_info_json:
        source = args.load_info_json
        input_source_name = getattr(source, 'name', '<file>')
    else:
        source = sys.stdin
        input_source_name = "stdin"
    try:
        info_json_content = source.read()
    except (OSError, UnicodeDecodeError) as e:
        logger.error(f"Failed to read info.json from {input_source_name}: {e}")
        return 1
    finally:
        # The parser opened this file for us; release it once it is read.
        if source is not sys.stdin:
            source.close()

    if not info_json_content.strip():
        logger.error(f"Failed to read info.json from {input_source_name}. Input is empty.")
        return 1
    try:
        info_data = json.loads(info_json_content)
    except json.JSONDecodeError:
        logger.error(f"Failed to parse info.json from {input_source_name}. Is the input valid JSON?")
        return 1
    if not isinstance(info_data, dict):
        # Everything below uses dict access on the document.
        logger.error(f"info.json from {input_source_name} must be a JSON object, got {type(info_data).__name__}.")
        return 1
    logger.info(f"Successfully loaded info.json from {input_source_name}.")

    if args.save_info_json_dir:
        # Best effort: a failure to save the copy does not abort the download.
        try:
            video_id = info_data.get('id', 'unknown_video_id')
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"{timestamp}-{video_id}-info.json"
            output_path = os.path.join(args.save_info_json_dir, filename)
            os.makedirs(args.save_info_json_dir, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(info_data, f, indent=2)
            logger.info(f"Saved info.json to {output_path}")
        except (OSError, ValueError) as e:
            logger.error(f"Failed to save info.json: {e}")

    # --- Proxy and proxy rename ---------------------------------------------
    proxy_url = args.proxy
    if not proxy_url:
        proxy_url = info_data.get('_proxy_url')
        if proxy_url:
            logger.info(f"Using proxy from info.json: {proxy_url}")
    if proxy_url and args.proxy_rename:
        proxy_url = _apply_proxy_rename(proxy_url, args.proxy_rename)
        if proxy_url is None:
            return 1

    # --- Baseline options from config file and extra args -------------------
    base_opts_args = []
    if args.cli_config:
        try:
            with open(args.cli_config, 'r', encoding='utf-8') as f:
                config_content = f.read()
            # comments=True so that "#" comments in the config file are
            # dropped instead of being tokenised as arguments.
            base_opts_args.extend(shlex.split(config_content, comments=True))
            logger.info(f"Loaded {len(base_opts_args)} arguments from config file: {args.cli_config}")
        except FileNotFoundError:
            logger.warning(f"Config file '{args.cli_config}' not found. Ignoring.")
        except (OSError, ValueError) as e:
            logger.error(f"Failed to read or parse config file {args.cli_config}: {e}")
            return 1
    if args.extra_ytdlp_args:
        try:
            extra_args_list = shlex.split(args.extra_ytdlp_args)
        except ValueError as e:
            # Raised by shlex for e.g. an unterminated quote.
            logger.error(f"Failed to parse --extra-ytdlp-args: {e}")
            return 1
        logger.info(f"Adding {len(extra_args_list)} extra arguments from --extra-ytdlp-args.")
        base_opts_args.extend(extra_args_list)

    ydl_opts = {}
    if base_opts_args:
        logger.info(f"Parsing {len(base_opts_args)} arguments from config/extra_args...")
        ydl_opts = _parse_cli_style_options(base_opts_args)
        logger.info("Successfully parsed extra yt-dlp options.")

    # Validate --downloader-args before touching the filesystem or yt-dlp.
    downloader_args = None
    if args.downloader_args:
        try:
            downloader_name, args_str = args.downloader_args.split(':', 1)
            # yt-dlp looks the arguments up by lower-case downloader name.
            downloader_args = (downloader_name.strip().lower(), shlex.split(args_str))
        except ValueError:
            logger.error(f"Invalid --downloader-args format. Expected 'downloader:args'. Got: '{args.downloader_args}'")
            return 1

    # --- Layer the script's explicit arguments on top (higher precedence) ---
    os.makedirs(args.output_dir, exist_ok=True)
    output_template = os.path.join(args.output_dir, '%(title)s [%(id)s].f%(format_id)s.%(ext)s')
    ytdlp_logger = YTDLPLogger()
    # Use update to merge, so explicit args overwrite config/extra args.
    ydl_opts.update({
        'format': args.format,
        'outtmpl': '-' if args.output_buffer else output_template,
        'logger': ytdlp_logger,
        'progress_hooks': [lambda d: ytdlp_progress_hook(d, ytdlp_logger)],
        'verbose': args.verbose,
    })
    if args.temp_path:
        ydl_opts['paths'] = {'temp': args.temp_path}
        logger.info(f"Using temporary path: {args.temp_path}")
    if args.download_continue:
        ydl_opts['continuedl'] = True
        ydl_opts['nooverwrites'] = True
    if proxy_url:
        ydl_opts['proxy'] = proxy_url
    if args.downloader:
        # YoutubeDL reads 'external_downloader' as {protocol: downloader};
        # 'default' applies the downloader to every protocol.
        ydl_opts['external_downloader'] = {'default': args.downloader}
    if downloader_args:
        # YoutubeDL reads 'external_downloader_args' as
        # {downloader name: [args]}, e.g. {'aria2c': ['-x', '8']}.
        name, extra = downloader_args
        existing = ydl_opts.get('external_downloader_args')
        merged = dict(existing) if isinstance(existing, dict) else {}
        merged[name] = extra
        ydl_opts['external_downloader_args'] = merged
    if args.merge_output_format:
        ydl_opts['merge_output_format'] = args.merge_output_format

    # --- Run the download ----------------------------------------------------
    # When downloading to buffer, stdout is redirected to capture the binary
    # data; otherwise a null context manager is used.
    download_buffer = io.BytesIO() if args.output_buffer else None
    if download_buffer is not None:
        stdout_ctx = contextlib.redirect_stdout(download_buffer)
    else:
        stdout_ctx = contextlib.nullcontext()
    try:
        logger.info(f"Starting download for format '{args.format}' using yt-dlp library...")
        with stdout_ctx, yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # The download() method is for URLs. For a pre-fetched info dict,
            # process_ie_result is used to bypass the info extraction step.
            # A failure surfaces as an exception handled below.
            ydl.process_ie_result(info_data)
        logger.info("yt-dlp download completed successfully.")
        if download_buffer is not None:
            # Write the captured binary data to the actual stdout.
            sys.stdout.buffer.write(download_buffer.getvalue())
            sys.stdout.buffer.flush()
        if ytdlp_logger.final_filename:
            # With --output-buffer stdout carries the data, so the filename
            # goes to stderr for the orchestrator.
            print(ytdlp_logger.final_filename, file=sys.stderr if args.output_buffer else sys.stdout)
    except yt_dlp.utils.DownloadError as e:
        # This catches download-specific errors from yt-dlp
        logger.error(f"yt-dlp DownloadError: {e}")
        return 1
    except Exception as e:
        # Library boundary: report anything unexpected and fail the command.
        logger.exception(f"An unexpected error occurred during yt-dlp execution: {e}")
        return 1

    if args.cleanup:
        downloaded_filepath = ytdlp_logger.final_filename
        if downloaded_filepath and os.path.exists(downloaded_filepath):
            if not _rename_and_truncate(downloaded_filepath):
                return 1  # Treat cleanup failure as a script failure
        elif not args.output_buffer:
            logger.warning("Cleanup requested, but no downloaded file was found. Skipping cleanup.")
    return 0