yt-dlp-dags/dags/ytdlp_mgmt_queues.py

# -*- coding: utf-8 -*-
"""
Airflow DAG for manually adding YouTube URLs or Video IDs to a Redis queue.
"""
from __future__ import annotations
import json
import logging
import re
from typing import List, Optional
import csv
import os
from datetime import datetime
from airflow.exceptions import AirflowException
from airflow.models.dag import DAG
from airflow.models.param import Param
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.operators.empty import EmptyOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
# Configure logging
logger = logging.getLogger(__name__)
# Default settings
DEFAULT_REDIS_CONN_ID = "redis_default"
DEFAULT_QUEUE_NAME = "video_queue"
DEFAULT_QUEUE_TO_CLEAR = 'PLEASE_SPECIFY_QUEUE_TO_CLEAR'
# --- Helper Functions ---
def _get_redis_client(redis_conn_id: str):
    """Gets a Redis client from an Airflow connection."""
    try:
        redis_hook = RedisHook(redis_conn_id=redis_conn_id)
        return redis_hook.get_conn()
    except Exception as e:
        logger.error(f"Failed to connect to Redis using connection '{redis_conn_id}': {e}")
        raise AirflowException(f"Redis connection failed: {e}")


def parse_video_inputs(input_str: str) -> List[str]:
    """Parses a flexible string of video inputs into a list of individual items."""
    if not input_str or not isinstance(input_str, str):
        return []
    input_str = input_str.strip()
    # 1. Try to parse as a JSON array
    if input_str.startswith("[") and input_str.endswith("]"):
        try:
            items = json.loads(input_str)
            if isinstance(items, list):
                logger.info("Successfully parsed input as a JSON array.")
                return [str(item).strip() for item in items]
        except json.JSONDecodeError:
            logger.warning("Input looked like a JSON array but failed to parse. Treating as a comma-separated string.")
    # 2. Treat as a comma-separated string
    items = [item.strip() for item in input_str.split(",")]
    # 3. Clean up quotes and extra whitespace from each item
    cleaned_items = []
    for item in items:
        if item.startswith(('"', "'")) and item.endswith(('"', "'")):
            item = item[1:-1]
        if item:  # Only add non-empty items
            cleaned_items.append(item.strip())
    return cleaned_items
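
# Illustrative usage (the IDs below are placeholders, not real videos). Each of the
# following calls returns the same two items, ['abc123DEF45', 'https://youtu.be/xyz789GHI01']:
#   parse_video_inputs('["abc123DEF45", "https://youtu.be/xyz789GHI01"]')
#   parse_video_inputs('abc123DEF45, https://youtu.be/xyz789GHI01')
#   parse_video_inputs("'abc123DEF45', 'https://youtu.be/xyz789GHI01'")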


def normalize_to_url(item: str) -> Optional[str]:
    """
    Validates if an item is a recognizable YouTube URL or video ID,
    and normalizes it to a standard watch URL format.
    """
    if not item:
        return None
    # Regex for a standard 11-character YouTube video ID
    video_id_pattern = r"^[a-zA-Z0-9_-]{11}$"
    # Check if the item itself is a video ID
    if re.match(video_id_pattern, item):
        video_id = item
        return f"https://www.youtube.com/watch?v={video_id}"
    # Comprehensive regex to extract video ID from various URL formats
    # Covers: watch, youtu.be, shorts, embed, /v/
    url_patterns = [
        r"(?:v=|\/v\/|youtu\.be\/|embed\/|shorts\/)([a-zA-Z0-9_-]{11})"
    ]
    for pattern in url_patterns:
        match = re.search(pattern, item)
        if match:
            video_id = match.group(1)
            return f"https://www.youtube.com/watch?v={video_id}"
    logger.warning(f"Could not recognize '{item}' as a valid YouTube URL or video ID.")
    return None
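
# Illustrative inputs (placeholder ID 'abc123DEF45') that all normalize to
# 'https://www.youtube.com/watch?v=abc123DEF45':
#   normalize_to_url('abc123DEF45')
#   normalize_to_url('https://youtu.be/abc123DEF45')
#   normalize_to_url('https://www.youtube.com/shorts/abc123DEF45')
# Anything that is neither an 11-character ID nor a recognized URL returns None.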


def dump_redis_data_to_csv(redis_client, dump_dir, patterns):
    """Dumps data from Redis keys matching patterns to separate CSV files in a timestamped directory."""
    timestamp_dir = datetime.now().strftime('%Y%m%d_%H%M%S')
    full_dump_path = os.path.join(dump_dir, timestamp_dir)
    os.makedirs(full_dump_path, exist_ok=True)
    logger.info(f"Created dump directory: {full_dump_path}")
    for pattern in patterns:
        if not pattern:
            continue
        # Sanitize pattern for filename
        sanitized_pattern = re.sub(r'[^a-zA-Z0-9_-]', '_', pattern)
        timestamp_file = datetime.now().strftime('%Y%m%d')
        dump_file_name = f'redis_dump_{sanitized_pattern}_{timestamp_file}.csv'
        dump_file_path = os.path.join(full_dump_path, dump_file_name)
        logger.info(f"Dumping keys matching '{pattern}' to {dump_file_path}")
        try:
            with open(dump_file_path, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['key', 'type', 'field_or_index', 'value'])
                keys_found = 0
                for key_bytes in redis_client.scan_iter(pattern):
                    key = key_bytes.decode('utf-8')
                    keys_found += 1
                    key_type = redis_client.type(key).decode('utf-8')
                    if key_type == 'hash':
                        for field, value in redis_client.hgetall(key).items():
                            writer.writerow([key, key_type, field.decode('utf-8'), value.decode('utf-8')])
                    elif key_type == 'list':
                        for index, value in enumerate(redis_client.lrange(key, 0, -1)):
                            writer.writerow([key, key_type, index, value.decode('utf-8')])
                    elif key_type == 'set':
                        for member in redis_client.smembers(key):
                            writer.writerow([key, key_type, None, member.decode('utf-8')])
                    elif key_type == 'string':
                        value = redis_client.get(key)
                        if value:
                            writer.writerow([key, key_type, None, value.decode('utf-8')])
            if keys_found > 0:
                logger.info(f"Successfully dumped {keys_found} keys for pattern '{pattern}' to {dump_file_path}")
            else:
                logger.info(f"No keys found for pattern '{pattern}'. Empty CSV file created at {dump_file_path}")
        except Exception as e:
            logger.error(f"Failed to dump Redis data for pattern '{pattern}': {e}", exc_info=True)
            raise AirflowException(f"Failed to dump Redis data for pattern '{pattern}': {e}")


def clear_queue_callable(**context):
    """Dumps Redis data to CSV and/or clears a specified Redis key."""
    params = context['params']
    redis_conn_id = params['redis_conn_id']
    queue_to_clear = params['queue_to_clear']
    dump_queues = params['dump_queues']
    # Get the rendered dump_dir from the templates_dict passed to the operator
    dump_dir = context['templates_dict']['dump_dir']
    dump_patterns = params['dump_patterns'].split(',') if params.get('dump_patterns') else []
    redis_client = _get_redis_client(redis_conn_id)
    if dump_queues and dump_patterns:
        dump_redis_data_to_csv(redis_client, dump_dir, dump_patterns)
    if not queue_to_clear or queue_to_clear == DEFAULT_QUEUE_TO_CLEAR:
        logger.info("Parameter 'queue_to_clear' is not specified or is the default placeholder. Skipping key deletion.")
        # If we only wanted to dump, this is a success.
        return
    logger.info(f"Attempting to clear Redis key '{queue_to_clear}' using connection '{redis_conn_id}'.")
    try:
        deleted_count = redis_client.delete(queue_to_clear)
        if deleted_count > 0:
            logger.info(f"Successfully cleared Redis key '{queue_to_clear}'.")
        else:
            logger.info(f"Redis key '{queue_to_clear}' did not exist or was already empty.")
    except Exception as e:
        logger.error(f"Failed to clear Redis key '{queue_to_clear}': {e}", exc_info=True)
        raise AirflowException(f"Failed to clear Redis key: {e}")


def list_contents_callable(**context):
    """Lists the contents of the specified Redis key (list or hash)."""
    params = context['params']
    redis_conn_id = params['redis_conn_id']
    queue_to_list = params['queue_to_list']
    max_items = params.get('max_items', 10)
    if not queue_to_list:
        raise ValueError("Parameter 'queue_to_list' cannot be empty.")
    logger.info(f"Attempting to list contents of Redis key '{queue_to_list}' (max: {max_items}) using connection '{redis_conn_id}'.")
    try:
        redis_client = _get_redis_client(redis_conn_id)
        key_type_bytes = redis_client.type(queue_to_list)
        key_type = key_type_bytes.decode('utf-8')  # Decode type
        if key_type == 'list':
            list_length = redis_client.llen(queue_to_list)
            # Get the last N items, which are the most recently added with rpush
            items_to_fetch = min(max_items, list_length)
            # lrange with negative indices gets items from the end of the list.
            # -N to -1 gets the last N items.
            contents_bytes = redis_client.lrange(queue_to_list, -items_to_fetch, -1)
            contents = [item.decode('utf-8') for item in contents_bytes]
            # Reverse the list so the absolute most recent item is printed first
            contents.reverse()
            logger.info(f"--- Contents of Redis List '{queue_to_list}' (showing most recent {len(contents)} of {list_length}) ---")
            for i, item in enumerate(contents):
                # The index here is just for display, 0 is the most recent
                logger.info(f" [recent_{i}]: {item}")
            if list_length > len(contents):
                logger.info(f" ... ({list_length - len(contents)} older items not shown)")
            logger.info("--- End of List Contents ---")
        elif key_type == 'hash':
            hash_size = redis_client.hlen(queue_to_list)
            # HGETALL can be risky for large hashes. Consider HSCAN for production.
            # For manual inspection, HGETALL is often acceptable.
            if hash_size > max_items * 2:  # Heuristic: avoid huge HGETALL
                logger.warning(f"Hash '{queue_to_list}' has {hash_size} fields, which is large. Listing might be slow or incomplete. Consider using redis-cli HSCAN.")
            # hgetall returns a dict of bytes keys and bytes values, decode them
            contents_bytes = redis_client.hgetall(queue_to_list)
            contents = {k.decode('utf-8'): v.decode('utf-8') for k, v in contents_bytes.items()}
            logger.info(f"--- Contents of Redis Hash '{queue_to_list}' ({len(contents)} fields) ---")
            item_count = 0
            for key, value in contents.items():  # key and value are now strings
                if item_count >= max_items:
                    logger.info(f" ... (stopped listing after {max_items} items of {hash_size})")
                    break
                # Attempt to pretty-print if value is JSON
                try:
                    parsed_value = json.loads(value)
                    pretty_value = json.dumps(parsed_value, indent=2)
                    logger.info(f" '{key}':\n{pretty_value}")
                except json.JSONDecodeError:
                    logger.info(f" '{key}': {value}")  # Print as string if not JSON
                item_count += 1
            logger.info("--- End of Hash Contents ---")
        elif key_type == 'none':
            logger.info(f"Redis key '{queue_to_list}' does not exist.")
        else:
            logger.info(f"Redis key '{queue_to_list}' is of type '{key_type}'. Listing contents for this type is not implemented.")
    except Exception as e:
        logger.error(f"Failed to list contents of Redis key '{queue_to_list}': {e}", exc_info=True)
        raise AirflowException(f"Failed to list Redis key contents: {e}")


def check_status_callable(**context):
    """Checks the status (type and size) of all standard Redis queues for a given base name."""
    params = context['params']
    redis_conn_id = params['redis_conn_id']
    queue_name = params.get('queue_name_for_status', DEFAULT_QUEUE_NAME)
    queue_suffixes = ['_inbox', '_progress', '_result', '_fail']
    logger.info(f"--- Checking Status for Queues with Base Name: '{queue_name}' ---")
    try:
        redis_client = _get_redis_client(redis_conn_id)
        for suffix in queue_suffixes:
            queue_to_check = f"{queue_name}{suffix}"
            key_type = redis_client.type(queue_to_check).decode('utf-8')
            size = 0
            if key_type == 'list':
                size = redis_client.llen(queue_to_check)
            elif key_type == 'hash':
                size = redis_client.hlen(queue_to_check)
            if key_type != 'none':
                logger.info(f" - Queue '{queue_to_check}': Type='{key_type.upper()}', Size={size}")
            else:
                logger.info(f" - Queue '{queue_to_check}': Does not exist.")
        logger.info("--- End of Status Check ---")
    except Exception as e:
        logger.error(f"Failed to check queue status for base name '{queue_name}': {e}", exc_info=True)
        raise AirflowException(f"Failed to check queue status: {e}")


def add_videos_to_queue_callable(**context):
    """
    Parses video inputs, normalizes them to URLs, and adds them to a Redis queue.
    """
    params = context["params"]
    video_inputs = params["video_inputs"]
    queue_name = params["queue_name"]
    redis_conn_id = params["redis_conn_id"]
    dry_run = params["dry_run"]
    if not video_inputs:
        logger.info("No video inputs provided. Nothing to do.")
        print("No video inputs provided. Nothing to do.")
        return
    raw_items = parse_video_inputs(video_inputs)
    if not raw_items:
        logger.info("Input string was empty or contained no items after parsing.")
        print("Input string was empty or contained no items after parsing.")
        return
    valid_urls = []
    for item in raw_items:
        url = normalize_to_url(item)
        if url and url not in valid_urls:
            valid_urls.append(url)
        elif not url:
            logger.warning(f"Skipping invalid input item: '{item}'")
    if not valid_urls:
        raise AirflowException("No valid YouTube URLs or IDs were found in the provided input.")
    logger.info(f"Found {len(valid_urls)} valid and unique URLs to add to the queue:")
    print(f"Found {len(valid_urls)} valid and unique URLs to add to the queue:")
    for url in valid_urls:
        logger.info(f" - {url}")
        print(f" - {url}")
    if dry_run:
        logger.info("Dry run is enabled. Skipping Redis operation.")
        print(f"\n[DRY RUN] Would have added {len(valid_urls)} URLs to the Redis list '{queue_name}_inbox'.")
        return
    # --- Add to Redis ---
    # Define the target key before the try block so the except handler can reference it safely.
    inbox_queue = f"{queue_name}_inbox"
    try:
        redis_client = _get_redis_client(redis_conn_id)
        # Use a pipeline for atomic and efficient addition
        with redis_client.pipeline() as pipe:
            for url in valid_urls:
                pipe.rpush(inbox_queue, url)
            pipe.execute()
        final_list_length = redis_client.llen(inbox_queue)
        success_message = (
            f"Successfully added {len(valid_urls)} URLs to Redis list '{inbox_queue}'. "
            f"The list now contains {final_list_length} items."
        )
        logger.info(success_message)
        print(f"\n{success_message}")
    except Exception as e:
        logger.error(f"Failed to add URLs to Redis queue '{inbox_queue}': {e}", exc_info=True)
        raise AirflowException(f"Failed to add URLs to Redis: {e}")


# --- DAG Definition ---
with DAG(
    dag_id="ytdlp_mgmt_queues",
    default_args={
        "owner": "airflow",
        "start_date": days_ago(1),
        "retries": 0,
    },
    schedule=None,
    catchup=False,
    tags=["ytdlp", "queue", "management", "redis", "manual"],
    doc_md="""
### YT-DLP Queue Management

This DAG provides a set of tools to manage Redis queues used by the YTDLP processing pipeline.
Select an `action` to perform when triggering the DAG.

**Actions:**

- `add_videos`: Add one or more YouTube videos to a queue.
- `clear_queue`: Dump and/or delete a specific Redis key.
- `list_contents`: View the contents of a Redis key (list or hash).
- `check_status`: Report the type and size of the standard queues (`_inbox`, `_progress`, `_result`, `_fail`) for a base queue name.
    """,
    params={
        "action": Param(
            "add_videos",
            type="string",
            enum=["add_videos", "clear_queue", "list_contents", "check_status"],
            title="Action",
            description="The management action to perform.",
        ),
        # --- Params for 'add_videos' ---
        "video_inputs": Param(
            None,
            type=["null", "string"],
            title="[add_videos] Video URLs or IDs",
            description="A single item, comma-separated list, or JSON array of YouTube URLs or Video IDs.",
        ),
        "queue_name": Param(
            DEFAULT_QUEUE_NAME,
            type="string",
            title="[add_videos] Queue Name",
            description="The base name of the Redis queue to add videos to (e.g., 'video_queue').",
        ),
        "dry_run": Param(
            False,
            type="boolean",
            title="[add_videos] Dry Run",
            description="If True, validate inputs without adding them to the queue.",
        ),
        # --- Params for 'clear_queue' ---
        "queue_to_clear": Param(
            DEFAULT_QUEUE_TO_CLEAR,
            type="string",
            title="[clear_queue] Queue to Clear",
            description="Exact name of the Redis key to delete.",
        ),
        "dump_queues": Param(
            True,
            type="boolean",
            title="[clear_queue] Dump Data",
            description="If True, dump data before clearing.",
        ),
"dump_dir": Param(
"{{ var.value.get('YTDLP_REDIS_DUMP_DIR', '/opt/airflow/dumps') }}",
type="string",
title="[clear_queue] Dump Directory",
description="Base directory to save CSV dump files.",
),
"dump_patterns": Param(
'ytdlp:*,video_queue_*',
type="string",
title="[clear_queue] Dump Patterns",
description="Comma-separated list of key patterns to dump.",
),
# --- Params for 'list_contents' ---
"queue_to_list": Param(
'video_queue_inbox',
type="string",
title="[list_contents] Queue to List",
description="Exact name of the Redis key to list.",
),
"max_items": Param(
10,
type="integer",
title="[list_contents] Max Items to List",
description="Maximum number of items to show.",
),
# --- Params for 'check_status' ---
"queue_name_for_status": Param(
DEFAULT_QUEUE_NAME,
type="string",
title="[check_status] Base Queue Name",
description="Base name of the queues to check (e.g., 'video_queue').",
),
# --- Common Params ---
"redis_conn_id": Param(
DEFAULT_REDIS_CONN_ID,
type="string",
title="Redis Connection ID",
),
},
) as dag:
    branch_on_action = BranchPythonOperator(
        task_id="branch_on_action",
        python_callable=lambda **context: f"action_{context['params']['action']}",
    )
    action_add_videos = PythonOperator(
        task_id="action_add_videos",
        python_callable=add_videos_to_queue_callable,
    )
    action_clear_queue = PythonOperator(
        task_id="action_clear_queue",
        python_callable=clear_queue_callable,
        # Render dump_dir at runtime: the 'YTDLP_REDIS_DUMP_DIR' Variable wins if set,
        # otherwise fall back to the 'dump_dir' param.
        templates_dict={'dump_dir': "{{ var.value.get('YTDLP_REDIS_DUMP_DIR', params.dump_dir) }}"},
    )
    action_list_contents = PythonOperator(
        task_id="action_list_contents",
        python_callable=list_contents_callable,
    )
    action_check_status = PythonOperator(
        task_id="action_check_status",
        python_callable=check_status_callable,
    )
    # --- Task Dependencies ---
    branch_on_action >> [action_add_videos, action_clear_queue, action_list_contents, action_check_status]
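
# Example trigger commands (illustrative; the video ID and key names are placeholders):
#   airflow dags trigger ytdlp_mgmt_queues --conf '{"action": "add_videos", "video_inputs": "abc123DEF45, https://youtu.be/abc123DEF45"}'
#   airflow dags trigger ytdlp_mgmt_queues --conf '{"action": "list_contents", "queue_to_list": "video_queue_inbox", "max_items": 5}'
#   airflow dags trigger ytdlp_mgmt_queues --conf '{"action": "check_status", "queue_name_for_status": "video_queue"}'
#   airflow dags trigger ytdlp_mgmt_queues --conf '{"action": "clear_queue", "queue_to_clear": "video_queue_fail", "dump_queues": true}'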