#!/usr/bin/env python3
"""
CLI tool to generate granular download task files from a directory of info.json files.
"""
import argparse
import json
import logging
import re
import signal
import sys
import time
from pathlib import Path

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Graceful shutdown handler
shutdown_event = False


def handle_shutdown(sig, frame):
    global shutdown_event
    logger.info("Shutdown signal received. Stopping task generator...")
    shutdown_event = True
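
# NOTE: a plain module-level bool is sufficient here. CPython runs signal
# handlers in the main thread, so there is no cross-thread visibility concern;
# the live-mode loop simply polls this flag between operations.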


def sanitize_format_for_filename(format_str: str) -> str:
    """Sanitizes a format selector string to be filesystem-friendly."""
    # Replace common problematic characters with underscores
    sanitized = re.sub(r'[\\/+:\[\]\s]', '_', format_str)
    # Strip leading/trailing separator characters left over after substitution
    sanitized = sanitized.strip('._-')
    return sanitized
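
# Examples (derived from the regex above):
#   "18"                  -> "18"
#   "bestvideo+bestaudio" -> "bestvideo_bestaudio"
#   "bv[height<=720]"     -> "bv_height<=720"  (trailing '_' is stripped)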


def add_task_generator_parser(subparsers):
    """Adds the parser for the 'task-generator' command."""
    parser = subparsers.add_parser(
        'task-generator',
        description="Generate granular download task files from info.json files.",
        formatter_class=argparse.RawTextHelpFormatter,
        help="Generate granular download task files."
    )

    # All functionality is under subcommands for extensibility.
    generate_subparsers = parser.add_subparsers(dest='task_generator_command', help='Action to perform', required=True)

    gen_parser = generate_subparsers.add_parser(
        'generate',
        help='Generate task files from a source directory.',
        description='Reads info.json files from a source directory and creates one task file per format in an output directory.'
    )
    gen_parser.add_argument('--source-dir', required=True, help='Directory containing the source info.json files.')
    gen_parser.add_argument('--output-dir', required=True, help='Directory where the generated task files will be saved.')
    gen_parser.add_argument('--formats', required=True, help='A comma-separated list of format IDs or selectors to generate tasks for (e.g., "18,140,bestvideo").')
    gen_parser.add_argument('--live', action='store_true', help='Run continuously, watching the source directory for new files.')
    gen_parser.add_argument('--interval-seconds', type=int, default=10, help='When in --live mode, how often to scan for new files.')
    gen_parser.add_argument('--dummy', action='store_true', help='Generate dummy task files without reading info.json content. Useful for testing download workers.')
    gen_parser.add_argument('--verbose', action='store_true', help='Enable verbose logging.')

    reset_parser = generate_subparsers.add_parser(
        'reset',
        help='Reset processed source files.',
        description='Finds all *.processed and *.LOCKED.* files in the source directory and renames them back to *.json to allow re-generation.'
    )
    reset_parser.add_argument('--source-dir', required=True, help='Directory containing the source info.json files to reset.')
    reset_parser.add_argument('--verbose', action='store_true', help='Enable verbose logging.')
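
# File lifecycle, as implemented below:
#   source:  <name>.json            (picked up by 'generate')
#   done:    <name>.json.processed  (renamed once all task files are written)
#   tasks:   <stem>-format-<fmt>.json in the output directory
# 'reset' renames *.json.processed and *.json.LOCKED.* back to *.json
# (the *.LOCKED.* markers are presumably created elsewhere in the pipeline).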


def _main_task_generator_reset(args):
    """Main logic for the 'reset' command."""
    source_dir = Path(args.source_dir)
    if not source_dir.is_dir():
        logger.error(f"Source directory does not exist or is not a directory: {source_dir}")
        return 1

    logger.info(f"Scanning for *.processed and *.LOCKED.* files in '{source_dir}' (recursively) to reset...")
    # Use rglob for recursive search
    processed_files = list(source_dir.rglob('*.json.processed'))
    locked_files = list(source_dir.rglob('*.json.LOCKED.*'))
    files_to_reset = processed_files + locked_files

    if not files_to_reset:
        logger.info("No processed or locked files found to reset.")
        return 0

    reset_count = 0
    for file_to_reset in files_to_reset:
        original_path = None
        if file_to_reset.name.endswith('.processed'):
            # Handles cases like file.json.processed
            original_path = file_to_reset.with_name(file_to_reset.name.removesuffix('.processed'))
        elif '.LOCKED.' in file_to_reset.name:
            # Handles cases like file.json.LOCKED.0. Split on the file name, not
            # the full path, so '.LOCKED.' in a parent directory cannot break this.
            original_path = file_to_reset.with_name(file_to_reset.name.split('.LOCKED.')[0])

        if original_path:
            try:
                if original_path.exists():
                    logger.warning(f"Original file '{original_path.name}' already exists. Deleting '{file_to_reset.name}' instead of renaming.")
                    file_to_reset.unlink()
                else:
                    file_to_reset.rename(original_path)
                    logger.debug(f"Reset '{file_to_reset.name}' to '{original_path.name}'")
                reset_count += 1
            except (IOError, OSError) as e:
                logger.error(f"Failed to reset '{file_to_reset.name}': {e}")
        else:
            logger.warning(f"Could not determine original filename for '{file_to_reset.name}'. Skipping.")

    logger.info(f"Successfully reset {reset_count} file(s).")
    return 0


def main_task_generator(args):
    """Main logic for the 'task-generator' tool."""
    if args.task_generator_command == 'generate':
        return _main_task_generator_generate(args)
    elif args.task_generator_command == 'reset':
        return _main_task_generator_reset(args)
    # Unreachable while the subparser is required, but kept as a safeguard.
    return 1


def _generate_tasks_for_file(source_file, output_dir, formats_to_generate, is_dummy_mode):
    """Helper function to generate task files for a single source info.json."""
    try:
        info_json_content = {}
        if is_dummy_mode:
            # In dummy mode, we don't read the file content. We create a minimal structure.
            # We try to parse the filename to get video_id and profile_name for organization.
            # Example filename: {video_id}-{profile_name}-{proxy}.info.json
            parts = source_file.stem.split('-')
            video_id = parts[0] if parts else 'dummy_video'
            profile_name = next((p for p in parts if p.startswith('user')), None)

            info_json_content = {
                'id': video_id,
                '_dummy': True,
                '_ytops_metadata': {
                    'profile_name': profile_name
                }
            }
            logger.debug(f"DUMMY MODE: Generating tasks for source file: {source_file.name}")
        else:
            with open(source_file, 'r', encoding='utf-8') as f:
                info_json_content = json.load(f)
    except (IOError, json.JSONDecodeError) as e:
        logger.warning(f"Skipping file '{source_file.name}' due to read/parse error: {e}")
        return 0
    except Exception as e:
        logger.error(f"An unexpected error occurred while processing '{source_file.name}': {e}")
        return 0

    tasks_generated_this_file = 0
    try:
        # Use metadata to create a profile-specific subdirectory for better organization.
        profile_name_from_meta = info_json_content.get('_ytops_metadata', {}).get('profile_name')
        final_output_dir = output_dir
        if profile_name_from_meta:
            final_output_dir = output_dir / profile_name_from_meta
            # Ensure the subdirectory exists. This is done once per source file.
            try:
                final_output_dir.mkdir(parents=True, exist_ok=True)
            except OSError as e:
                logger.error(f"Could not create profile subdirectory '{final_output_dir}': {e}. Skipping tasks for this source file.")
                return 0

        for format_str in formats_to_generate:
            task_data = info_json_content.copy()
            # Add the target format to the task data itself. This makes the task file self-contained.
            task_data['_ytops_download_format'] = format_str

            # Create a unique filename for the task
            original_stem = source_file.stem
            safe_format_str = sanitize_format_for_filename(format_str)
            task_filename = f"{original_stem}-format-{safe_format_str}.json"
            output_path = final_output_dir / task_filename

            # Check if this specific task file already exists to avoid re-writing
            if output_path.exists():
                logger.debug(f"Task file already exists, skipping generation: {output_path}")
                continue

            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(task_data, f, indent=2)
            logger.debug(f"Generated task file: {output_path}")
            tasks_generated_this_file += 1

        # Mark the source file as processed now that we've iterated through all formats for it.
        try:
            processed_path = source_file.with_suffix(f"{source_file.suffix}.processed")
            source_file.rename(processed_path)
            logger.debug(f"Marked '{source_file.name}' as processed.")
        except (IOError, OSError) as e:
            logger.error(f"Failed to mark source file '{source_file.name}' as processed: {e}")

    except IOError as e:
        logger.error(f"An I/O error occurred while generating tasks for '{source_file.name}': {e}. It will be retried on the next run.")
        # The file is not renamed, so it will be picked up again.

    return tasks_generated_this_file
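
# Worked example: a (hypothetical) source file "abc123-user1-proxy0.info.json"
# processed with --formats "140" yields the task file
# "abc123-user1-proxy0.info-format-140.json" (under the profile subdirectory
# when _ytops_metadata.profile_name is present), and the source is renamed to
# "abc123-user1-proxy0.info.json.processed".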


def _main_task_generator_generate(args):
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    signal.signal(signal.SIGINT, handle_shutdown)
    signal.signal(signal.SIGTERM, handle_shutdown)

    source_dir = Path(args.source_dir)
    output_dir = Path(args.output_dir)
    formats_to_generate = [f.strip() for f in args.formats.split(',') if f.strip()]

    if not formats_to_generate:
        # Without this guard, source files would be marked processed with zero tasks written.
        logger.error("No valid formats were supplied via --formats. Nothing to do.")
        return 1

    if not source_dir.is_dir():
        logger.error(f"Source directory does not exist or is not a directory: {source_dir}")
        return 1

    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        logger.error(f"Could not create output directory '{output_dir}': {e}")
        return 1

    if not args.live:
        logger.info(f"Scanning for info.json files in '{source_dir}' (recursively)...")
        source_files = list(source_dir.rglob('*.json'))

        if not source_files:
            logger.info(f"No .json files found in '{source_dir}'. Nothing to do.")
            return 0

        logger.info(f"Found {len(source_files)} source file(s). Generating tasks for formats: {', '.join(formats_to_generate)}...")

        total_tasks_generated = 0
        for source_file in source_files:
            tasks_for_file = _generate_tasks_for_file(source_file, output_dir, formats_to_generate, args.dummy)
            total_tasks_generated += tasks_for_file

        logger.info(f"Successfully generated {total_tasks_generated} new task file(s) in '{output_dir}'.")
        return 0

    # --- Live Mode ---
    logger.info(f"Running in LIVE mode. Watching '{source_dir}' for new files every {args.interval_seconds}s. Press Ctrl+C to stop.")
    total_tasks_generated = 0

    while not shutdown_event:
        try:
            logger.debug("Live mode: Scanning for new source files...")
            source_files = list(source_dir.rglob('*.json'))

            if not source_files:
                logger.debug("Live mode: No source files found.")
            else:
                logger.info(f"Live mode: Found {len(source_files)} source file(s) to process.")
                for source_file in source_files:
                    if shutdown_event:
                        break
                    tasks_for_file = _generate_tasks_for_file(source_file, output_dir, formats_to_generate, args.dummy)
                    total_tasks_generated += tasks_for_file

            if shutdown_event:
                break

            logger.debug(f"Live mode: Scan complete. Sleeping for {args.interval_seconds}s...")
            # Sleep in one-second slices so a shutdown signal is honored promptly
            # (time.sleep() resumes after a non-raising signal handler; see PEP 475).
            for _ in range(args.interval_seconds):
                if shutdown_event:
                    break
                time.sleep(1)

        except Exception as e:
            logger.error(f"An unexpected error occurred in the live loop: {e}", exc_info=True)
            time.sleep(5)  # Pause before retrying to avoid spamming errors

    logger.info(f"Task generator stopped. Total tasks generated in this run: {total_tasks_generated}.")
    return 0
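

# Minimal standalone entry point: a sketch, assuming this module is normally
# registered into a larger multi-tool CLI via add_task_generator_parser().
# The program description below is a placeholder, not part of the original tool.
if __name__ == '__main__':
    top_level_parser = argparse.ArgumentParser(description="ytops task tooling (standalone sketch)")
    top_level_subparsers = top_level_parser.add_subparsers(dest='command', required=True)
    add_task_generator_parser(top_level_subparsers)
    parsed_args = top_level_parser.parse_args()
    sys.exit(main_task_generator(parsed_args))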