323 lines
15 KiB
Python
323 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Tool to download a specified format using an info.json from stdin.
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import shlex
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
import time
|
|
from datetime import datetime
|
|
|
|
# Configure logging
|
|
logger = logging.getLogger('download_tool')
|
|
|
|
def add_download_parser(subparsers):
    """Register the 'cli' sub-command on *subparsers* and return its parser.

    The 'cli' sub-command is a legacy wrapper that shells out to the yt-dlp
    executable; every option defined here either controls this wrapper's own
    behaviour or is forwarded to the yt-dlp command line.
    """
    parser = subparsers.add_parser(
        'cli',
        description='Download using the legacy yt-dlp CLI wrapper. This method invokes yt-dlp as a subprocess.',
        formatter_class=argparse.RawTextHelpFormatter,
        help='Download using the legacy yt-dlp CLI wrapper.'
    )

    # Alias the bound method so each option reads as a single short call.
    add = parser.add_argument

    # --- Input / output locations -------------------------------------
    add('--load-info-json', type=argparse.FileType('r', encoding='utf-8'),
        help="Path to the info.json file. If not provided, reads from stdin.")
    add('-f', '--format', required=True,
        help='The format selection string to download (e.g., "18", "299/137", "bestvideo+bestaudio").')
    add('--output-dir', default='.',
        help='Directory to save the downloaded file. Defaults to current directory.')
    add('--save-info-json-dir',
        help='If specified, save the info.json received from stdin to this directory with an auto-generated name.')

    # --- Proxy handling -----------------------------------------------
    add('--proxy',
        help='Proxy to use for the download, e.g., "socks5://127.0.0.1:1080". This option sets the proxy, overriding any value from the info.json.')
    add('--proxy-rename',
        help='Apply sed-style regex substitution to the proxy URL. Format: s/pattern/replacement/')

    # --- Wrapper-script behaviour -------------------------------------
    add('--pause', type=int, default=0,
        help='Seconds to wait before starting the download.')
    add('--print-traffic', action='store_true',
        help='Print traffic instead of a progress bar.')
    add('--download-continue', action='store_true',
        help='Enable download continuation (--continue and --part flags for yt-dlp).')
    add('--verbose', action='store_true',
        help='Enable verbose output for this script and yt-dlp.')
    add('--cli-config', default='cli.config',
        help='Path to a yt-dlp configuration file. Defaults to "cli.config".')
    add('--cleanup', action='store_true',
        help='After download, rename the file to include a timestamp and truncate it to 0 bytes.')
    add('--log-file',
        help='Append full yt-dlp output to the specified log file.')
    add('--yt-dlp-path', default='yt-dlp',
        help='Path to the yt-dlp executable. Defaults to "yt-dlp" in PATH.')
    add('--extra-ytdlp-args',
        help='A string of extra command-line arguments to pass to yt-dlp.')

    # --- Downloader selection / network tuning forwarded to yt-dlp ----
    add('--downloader',
        help='Name of the external downloader to use (e.g., "aria2c", "native").')
    add('--downloader-args',
        help='Arguments to pass to the external downloader (e.g., "aria2c:-x 8").')
    add('--merge-output-format',
        help='Container format to merge to (e.g., "mp4", "mkv"). Overrides config file.')
    add('--retries',
        help='Number of retries for the entire download (default: 10).')
    add('--fragment-retries',
        help='Number of retries for each fragment (default: 10).')
    add('--socket-timeout',
        help='Timeout for socket operations in seconds (default: 20).')
    add('--lang',
        help='Language code for the request (e.g., "fr", "ja"). Affects metadata language.')
    add('--timezone',
        help='Timezone for the request (e.g., "UTC", "America/New_York"). Note: not supported by yt-dlp.')

    # --- Arguments passed straight through to yt-dlp ------------------
    add('--download-sections',
        help='yt-dlp --download-sections argument (e.g., "*0-10240").')
    add('--test', action='store_true',
        help='yt-dlp --test argument (download small part).')

    return parser
|
|
|
|
def main_download(args):
    """Main logic for the 'download' command.

    Reads an info.json (from --load-info-json or stdin), builds a yt-dlp
    command line from *args*, runs it as a subprocess, optionally captures
    and logs its output, and returns a process-style exit code
    (0 = success, non-zero = failure).
    """
    if args.verbose:
        # Raise the ROOT logger to DEBUG so this script's debug messages show.
        logging.getLogger().setLevel(logging.DEBUG)

    # Optional startup delay before any work is done.
    if args.pause > 0:
        logger.info(f"Pausing for {args.pause} seconds...")
        time.sleep(args.pause)

    # --- Read the raw info.json text from file or stdin ---------------
    info_json_content = ""
    input_source_name = ""
    if args.load_info_json:
        # argparse opened this file for us (FileType); read it fully.
        info_json_content = args.load_info_json.read()
        input_source_name = args.load_info_json.name
    else:
        info_json_content = sys.stdin.read()
        input_source_name = "stdin"

    # Empty/whitespace-only input is treated as a hard error.
    if not info_json_content.strip():
        logger.error(f"Failed to read info.json from {input_source_name}. Input is empty.")
        return 1

    # --- Parse the JSON ------------------------------------------------
    try:
        info_data = json.loads(info_json_content)
        logger.info(f"Successfully loaded info.json from {input_source_name}.")
    except json.JSONDecodeError:
        logger.error(f"Failed to parse info.json from {input_source_name}. Is the input valid JSON?")
        return 1

    # --- Optionally archive a copy of the info.json --------------------
    # Failure here is logged but non-fatal: the download still proceeds.
    if args.save_info_json_dir:
        try:
            video_id = info_data.get('id', 'unknown_video_id')
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"{timestamp}-{video_id}-info.json"
            output_path = os.path.join(args.save_info_json_dir, filename)
            os.makedirs(args.save_info_json_dir, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(info_data, f, indent=2)
            logger.info(f"Saved info.json to {output_path}")
        except Exception as e:
            logger.error(f"Failed to save info.json: {e}")

    # --- Determine proxy to use ----------------------------------------
    # CLI --proxy wins; otherwise fall back to a '_proxy_url' key embedded
    # in the info.json (a private extension, not a standard yt-dlp field).
    proxy_url = args.proxy
    if not proxy_url:
        proxy_url = info_data.get('_proxy_url')
        if proxy_url:
            logger.info(f"Using proxy from info.json: {proxy_url}")

    # --- Optionally rewrite the proxy URL with a sed-style rule ---------
    if proxy_url and args.proxy_rename:
        rename_rule = args.proxy_rename
        # The user's command line might include quotes that are preserved by shlex.
        # Strip them to get the raw rule.
        rename_rule = rename_rule.strip("'\"")
        if rename_rule.startswith('s/') and rename_rule.count('/') >= 2:
            try:
                # NOTE(review): a plain '/' split means the pattern/replacement
                # cannot themselves contain slashes (even escaped ones) —
                # anything after the second '/' is silently ignored.
                parts = rename_rule.split('/')
                pattern = parts[1]
                replacement = parts[2]
                original_proxy = proxy_url
                proxy_url = re.sub(pattern, replacement, proxy_url)
                logger.info(f"Renamed proxy URL from '{original_proxy}' to '{proxy_url}' using rule '{rename_rule}'")
            except re.error as e:
                logger.error(f"Invalid regex in --proxy-rename: {e}")
                return 1
            except IndexError:
                logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/")
                return 1
        else:
            logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/")
            return 1

    # yt-dlp needs to load the info.json from a file, so persist the parsed
    # data to a temp file. delete=False because yt-dlp must be able to open
    # it after this 'with' closes; removal happens in the 'finally' below.
    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', encoding='utf-8') as tmp:
        json.dump(info_data, tmp)
        info_json_path = tmp.name

    logger.debug(f"Temporarily saved info.json to {info_json_path}")

    downloaded_filepath = None
    return_code = 1  # Default to error

    try:
        # Create output directory if it doesn't exist
        os.makedirs(args.output_dir, exist_ok=True)
        output_template = os.path.join(args.output_dir, '%(title)s [%(id)s].f%(format_id)s.%(ext)s')

        # Base command: '--print filename' makes yt-dlp emit the output
        # path on stdout, which is how we detect the downloaded file later.
        cmd = [
            args.yt_dlp_path,
            '--load-info-json', info_json_path,
            '-f', args.format,
            '-o', output_template,
            '--print', 'filename',
        ]

        # User-supplied raw extra args are tokenized shell-style.
        if args.extra_ytdlp_args:
            cmd.extend(shlex.split(args.extra_ytdlp_args))

        # External downloader selection and its arguments.
        if args.downloader:
            cmd.extend(['--downloader', args.downloader])
        if args.downloader_args:
            cmd.extend(['--downloader-args', args.downloader_args])
        if args.merge_output_format:
            cmd.extend(['--merge-output-format', args.merge_output_format])

        if args.download_sections:
            cmd.extend(['--download-sections', args.download_sections])

        if args.test:
            cmd.append('--test')

        # Retry / timeout tuning (values are kept as strings for argv).
        if args.retries:
            cmd.extend(['--retries', str(args.retries)])
        if args.fragment_retries:
            cmd.extend(['--fragment-retries', str(args.fragment_retries)])
        if args.socket_timeout:
            cmd.extend(['--socket-timeout', str(args.socket_timeout)])

        if args.download_continue:
            cmd.extend(['--continue', '--part'])

        # Use the config file only if it actually exists on disk.
        if os.path.exists(args.cli_config):
            logger.info(f"Using config file: {args.cli_config}")
            cmd.extend(['--config-location', args.cli_config])
        else:
            logger.info(f"Config file '{args.cli_config}' not found. Using yt-dlp defaults.")

        # Traffic dump and progress bar are mutually exclusive here.
        if args.print_traffic:
            cmd.append('--print-traffic')
            cmd.append('--no-progress')
        else:
            cmd.append('--progress')

        if args.verbose:
            cmd.append('--verbose')

        if proxy_url:
            cmd.extend(['--proxy', proxy_url])

        # Language is forwarded as a youtube extractor argument.
        if args.lang:
            cmd.extend(['--extractor-args', f'youtube:lang={args.lang}'])

        if args.timezone:
            logger.warning(f"Timezone override ('{args.timezone}') is not supported by yt-dlp and will be ignored.")

        # Determine if we need to capture output: cleanup needs the printed
        # filename, log-file needs the full text, print-traffic needs both.
        capture_output = args.cleanup or args.log_file or args.print_traffic

        if capture_output and not args.print_traffic:
            logger.info("Note: --cleanup or --log-file requires capturing output, which may affect progress bar display.")

        logger.info(f"Executing yt-dlp command for format '{args.format}'")

        # Construct a display version of the command for logging
        # (quote only args containing spaces; for humans, not for re-execution).
        display_cmd_str = ' '.join(f"'{arg}'" if ' ' in arg else arg for arg in cmd)
        # Also log the config file contents (whitespace-collapsed) for context.
        if os.path.exists(args.cli_config):
            try:
                with open(args.cli_config, 'r', encoding='utf-8') as f:
                    config_contents = ' '.join(f.read().split())
                if config_contents:
                    logger.info(f"cli.config contents: {config_contents}")
            except IOError as e:
                logger.warning(f"Could not read config file {args.cli_config}: {e}")

        logger.info(f"Full command: {display_cmd_str}")

        if capture_output:
            # Buffer the child's entire stdout/stderr in memory.
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding='utf-8')

            # Open the append-mode log file up front; on failure we log the
            # error and continue without file logging (log_f stays None).
            log_f = None
            if args.log_file:
                try:
                    log_f = open(args.log_file, 'a', encoding='utf-8')
                    log_f.write(f"\n--- Log entry: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ---\n")
                    log_f.write(f"Command: {' '.join(cmd)}\n\n")
                except IOError as e:
                    logger.error(f"Failed to open log file {args.log_file}: {e}")

            # Blocks until yt-dlp exits; no streaming of output.
            stdout_data, stderr_data = process.communicate()
            return_code = process.returncode

            # Post-run check for silent failures, like 403 errors where yt-dlp might still exit 0.
            if return_code == 0:
                output_text = (stdout_data or "") + (stderr_data or "")
                if "HTTP Error 403" in output_text:
                    logger.error("yt-dlp exited successfully, but a 403 error was detected in its output. Forcing failure.")
                    return_code = 1  # Override success code
                # NOTE(review): this substring heuristic can false-positive —
                # e.g. verbose output that merely mentions a timeout setting
                # would match "timeout" and force a failure. Confirm intended.
                elif "timed out" in output_text.lower() or "timeout" in output_text.lower():
                    logger.error("yt-dlp exited successfully, but a timeout was detected in its output. Forcing failure.")
                    return_code = 1

            # Write captured output to terminal and log file
            # (splitlines(True) keeps line endings so the log mirrors output).
            if stdout_data:
                sys.stdout.write(stdout_data)
                sys.stdout.flush()
                if log_f:
                    for line in stdout_data.splitlines(True):
                        log_f.write(f"[stdout] {line}")

            if stderr_data:
                sys.stderr.write(stderr_data)
                sys.stderr.flush()
                if log_f:
                    for line in stderr_data.splitlines(True):
                        log_f.write(f"[stderr] {line}")

            stdout_lines = stdout_data.splitlines() if stdout_data else []

            if log_f:
                log_f.write(f"\n--- End log entry (yt-dlp exit code: {return_code}) ---\n")
                log_f.close()

            # The '--print filename' line should be the last stdout line that
            # names an existing file; scan from the end to find it.
            for line in reversed(stdout_lines):
                if line and os.path.exists(line):
                    downloaded_filepath = line
                    logger.info(f"Detected downloaded file: {downloaded_filepath}")
                    break
        else:
            # Original behavior: progress bar direct to terminal, no capture
            process = subprocess.Popen(cmd)
            process.wait()
            return_code = process.returncode

        if return_code != 0:
            logger.error(f"yt-dlp exited with error code {return_code}")
        else:
            logger.info("yt-dlp command completed successfully.")

    except Exception as e:
        # Catch-all boundary: log with traceback and report failure.
        logger.exception(f"An unexpected error occurred: {e}")
        return 1
    finally:
        # Clean up the temporary file regardless of success or failure.
        if os.path.exists(info_json_path):
            os.unlink(info_json_path)
            logger.debug(f"Removed temporary file {info_json_path}")

    # Cleanup phase: deliberately destructive — keeps a zero-byte,
    # timestamped marker file in place of the downloaded content.
    if args.cleanup:
        if downloaded_filepath and os.path.exists(downloaded_filepath):
            try:
                logger.info(f"Cleanup: Renaming and truncating '{downloaded_filepath}'")

                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

                directory, original_filename = os.path.split(downloaded_filepath)
                filename_base, filename_ext = os.path.splitext(original_filename)

                # New name format is [base]_[timestamp][ext].empty
                new_filename = f"{filename_base}_{timestamp}{filename_ext}.empty"
                new_filepath = os.path.join(directory, new_filename)

                os.rename(downloaded_filepath, new_filepath)
                logger.info(f"Renamed to '{new_filepath}'")

                # Opening in 'w' mode truncates the file to zero bytes.
                with open(new_filepath, 'w') as f:
                    pass
                logger.info(f"Truncated '{new_filepath}' to 0 bytes.")

            except Exception as e:
                logger.error(f"Cleanup failed: {e}")
                return 1
        else:
            logger.warning("Cleanup requested, but no downloaded file was found. Skipping cleanup.")

    return return_code
|