# -*- coding: utf-8 -*-
"""
Airflow DAG for manually adding YouTube URLs or Video IDs to a Redis queue.
"""
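
# Example manual trigger from the CLI (illustrative values; the same keys can be
# set via the "Trigger DAG w/ config" form in the UI):
#   airflow dags trigger ytdlp_mgmt_queues \
#     --conf '{"action": "add_videos", "video_inputs": "dQw4w9WgXcQ", "dry_run": true}'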

from __future__ import annotations

import csv
import json
import logging
import os
import re
from datetime import datetime
from typing import List, Optional

from airflow.exceptions import AirflowException
from airflow.models.dag import DAG
from airflow.models.param import Param
from airflow.operators.python import BranchPythonOperator, PythonOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago

# Configure logging
logger = logging.getLogger(__name__)

# Default settings
DEFAULT_REDIS_CONN_ID = "redis_default"
DEFAULT_QUEUE_NAME = "video_queue"
DEFAULT_QUEUE_TO_CLEAR = "PLEASE_SPECIFY_QUEUE_TO_CLEAR"


# --- Helper Functions ---

def _get_redis_client(redis_conn_id: str):
    """Gets a Redis client from an Airflow connection."""
    try:
        redis_hook = RedisHook(redis_conn_id=redis_conn_id)
        return redis_hook.get_conn()
    except Exception as e:
        logger.error(f"Failed to connect to Redis using connection '{redis_conn_id}': {e}")
        raise AirflowException(f"Redis connection failed: {e}")


def parse_video_inputs(input_str: str) -> List[str]:
    """Parses a flexible string of video inputs into a list of individual items."""
    if not input_str or not isinstance(input_str, str):
        return []

    input_str = input_str.strip()

    # 1. Try to parse as a JSON array
    if input_str.startswith("[") and input_str.endswith("]"):
        try:
            items = json.loads(input_str)
            if isinstance(items, list):
                logger.info("Successfully parsed input as a JSON array.")
                return [str(item).strip() for item in items]
        except json.JSONDecodeError:
            logger.warning("Input looked like a JSON array but failed to parse. Treating as a comma-separated string.")

    # 2. Treat as a comma-separated string
    items = [item.strip() for item in input_str.split(",")]

    # 3. Clean up quotes and extra whitespace from each item
    cleaned_items = []
    for item in items:
        if item.startswith(('"', "'")) and item.endswith(('"', "'")):
            item = item[1:-1]
        if item:  # Only add non-empty items
            cleaned_items.append(item.strip())

    return cleaned_items
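
# Illustrative behaviour of parse_video_inputs (derived from the logic above):
#   parse_video_inputs('["dQw4w9WgXcQ", "https://youtu.be/abc123DEF45"]')
#       -> ['dQw4w9WgXcQ', 'https://youtu.be/abc123DEF45']
#   parse_video_inputs("dQw4w9WgXcQ, 'https://youtu.be/abc123DEF45'")
#       -> ['dQw4w9WgXcQ', 'https://youtu.be/abc123DEF45']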


def normalize_to_url(item: str) -> Optional[str]:
    """
    Validates if an item is a recognizable YouTube URL or video ID,
    and normalizes it to a standard watch URL format.
    """
    if not item:
        return None

    # Regex for a standard 11-character YouTube video ID
    video_id_pattern = r"^[a-zA-Z0-9_-]{11}$"

    # Check if the item itself is a video ID
    if re.match(video_id_pattern, item):
        video_id = item
        return f"https://www.youtube.com/watch?v={video_id}"

    # Comprehensive regex to extract video ID from various URL formats
    # Covers: watch, youtu.be, shorts, embed, /v/
    url_patterns = [
        r"(?:v=|\/v\/|youtu\.be\/|embed\/|shorts\/)([a-zA-Z0-9_-]{11})"
    ]
    for pattern in url_patterns:
        match = re.search(pattern, item)
        if match:
            video_id = match.group(1)
            return f"https://www.youtube.com/watch?v={video_id}"

    logger.warning(f"Could not recognize '{item}' as a valid YouTube URL or video ID.")
    return None
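
# Illustrative normalization (derived from the patterns above):
#   'dQw4w9WgXcQ'                                -> 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'
#   'https://youtu.be/dQw4w9WgXcQ'               -> 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'
#   'https://www.youtube.com/shorts/dQw4w9WgXcQ' -> 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'
#   'https://example.com/other'                  -> None (logged as a warning)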


def dump_redis_data_to_csv(redis_client, dump_dir, patterns):
    """Dumps data from Redis keys matching patterns to separate CSV files in a timestamped directory."""
    timestamp_dir = datetime.now().strftime('%Y%m%d_%H%M%S')
    full_dump_path = os.path.join(dump_dir, timestamp_dir)

    os.makedirs(full_dump_path, exist_ok=True)
    logger.info(f"Created dump directory: {full_dump_path}")

    for pattern in patterns:
        if not pattern:
            continue

        # Sanitize pattern for filename
        sanitized_pattern = re.sub(r'[^a-zA-Z0-9_-]', '_', pattern)
        timestamp_file = datetime.now().strftime('%Y%m%d')
        dump_file_name = f'redis_dump_{sanitized_pattern}_{timestamp_file}.csv'
        dump_file_path = os.path.join(full_dump_path, dump_file_name)

        logger.info(f"Dumping keys matching '{pattern}' to {dump_file_path}")

        try:
            with open(dump_file_path, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['key', 'type', 'field_or_index', 'value'])

                keys_found = 0
                for key_bytes in redis_client.scan_iter(pattern):
                    key = key_bytes.decode('utf-8')
                    keys_found += 1
                    key_type = redis_client.type(key).decode('utf-8')

                    if key_type == 'hash':
                        for field, value in redis_client.hgetall(key).items():
                            writer.writerow([key, key_type, field.decode('utf-8'), value.decode('utf-8')])
                    elif key_type == 'list':
                        for index, value in enumerate(redis_client.lrange(key, 0, -1)):
                            writer.writerow([key, key_type, index, value.decode('utf-8')])
                    elif key_type == 'set':
                        for member in redis_client.smembers(key):
                            writer.writerow([key, key_type, None, member.decode('utf-8')])
                    elif key_type == 'string':
                        value = redis_client.get(key)
                        if value:
                            writer.writerow([key, key_type, None, value.decode('utf-8')])

                if keys_found > 0:
                    logger.info(f"Successfully dumped {keys_found} keys for pattern '{pattern}' to {dump_file_path}")
                else:
                    logger.info(f"No keys found for pattern '{pattern}'. Empty CSV file created at {dump_file_path}")

        except Exception as e:
            logger.error(f"Failed to dump Redis data for pattern '{pattern}': {e}", exc_info=True)
            raise AirflowException(f"Failed to dump Redis data for pattern '{pattern}': {e}")
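
# Illustrative CSV output from dump_redis_data_to_csv (one row per element/field,
# after the 'key,type,field_or_index,value' header); exact values depend on the
# queue contents, e.g.:
#   video_queue_inbox,list,0,https://www.youtube.com/watch?v=dQw4w9WgXcQ
#   video_queue_result,hash,dQw4w9WgXcQ,"{""status"": ""done""}"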


def clear_queue_callable(**context):
    """Dumps Redis data to CSV and/or clears a specified Redis key."""
    params = context['params']
    redis_conn_id = params['redis_conn_id']
    queue_to_clear = params['queue_to_clear']
    dump_queues = params['dump_queues']
    # Get the rendered dump_dir from the templates_dict passed to the operator
    dump_dir = context['templates_dict']['dump_dir']
    # Strip whitespace so inputs like "ytdlp:*, video_queue_*" still match
    dump_patterns = [p.strip() for p in params['dump_patterns'].split(',')] if params.get('dump_patterns') else []

    redis_client = _get_redis_client(redis_conn_id)

    if dump_queues and dump_patterns:
        dump_redis_data_to_csv(redis_client, dump_dir, dump_patterns)

    if not queue_to_clear or queue_to_clear == DEFAULT_QUEUE_TO_CLEAR:
        logger.info("Parameter 'queue_to_clear' is not specified or is the default placeholder. Skipping key deletion.")
        # If we only wanted to dump, this is a success.
        return

    logger.info(f"Attempting to clear Redis key '{queue_to_clear}' using connection '{redis_conn_id}'.")
    try:
        deleted_count = redis_client.delete(queue_to_clear)
        if deleted_count > 0:
            logger.info(f"Successfully cleared Redis key '{queue_to_clear}'.")
        else:
            logger.info(f"Redis key '{queue_to_clear}' did not exist or was already empty.")
    except Exception as e:
        logger.error(f"Failed to clear Redis key '{queue_to_clear}': {e}", exc_info=True)
        raise AirflowException(f"Failed to clear Redis key: {e}")
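
# Note: 'queue_to_clear' must be an exact key name (e.g. 'video_queue_inbox');
# only the dump patterns above support glob-style matching via SCAN.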


def list_contents_callable(**context):
    """Lists the contents of the specified Redis key (list or hash)."""
    params = context['params']
    redis_conn_id = params['redis_conn_id']
    queue_to_list = params['queue_to_list']
    max_items = params.get('max_items', 10)

    if not queue_to_list:
        raise ValueError("Parameter 'queue_to_list' cannot be empty.")

    logger.info(f"Attempting to list contents of Redis key '{queue_to_list}' (max: {max_items}) using connection '{redis_conn_id}'.")
    try:
        redis_client = _get_redis_client(redis_conn_id)
        key_type_bytes = redis_client.type(queue_to_list)
        key_type = key_type_bytes.decode('utf-8')  # Decode type

        if key_type == 'list':
            list_length = redis_client.llen(queue_to_list)
            # Get the last N items, which are the most recently added with rpush
            items_to_fetch = min(max_items, list_length)
            # lrange with negative indices gets items from the end of the list.
            # -N to -1 gets the last N items.
            contents_bytes = redis_client.lrange(queue_to_list, -items_to_fetch, -1)
            contents = [item.decode('utf-8') for item in contents_bytes]
            # Reverse the list so the absolute most recent item is printed first
            contents.reverse()
            logger.info(f"--- Contents of Redis List '{queue_to_list}' (showing most recent {len(contents)} of {list_length}) ---")
            for i, item in enumerate(contents):
                # The index here is just for display, 0 is the most recent
                logger.info(f" [recent_{i}]: {item}")
            if list_length > len(contents):
                logger.info(f" ... ({list_length - len(contents)} older items not shown)")
            logger.info("--- End of List Contents ---")

        elif key_type == 'hash':
            hash_size = redis_client.hlen(queue_to_list)
            # HGETALL can be risky for large hashes. Consider HSCAN for production.
            # For manual inspection, HGETALL is often acceptable.
            if hash_size > max_items * 2:  # Heuristic: avoid huge HGETALL
                logger.warning(f"Hash '{queue_to_list}' has {hash_size} fields, which is large. Listing might be slow or incomplete. Consider using redis-cli HSCAN.")
            # hgetall returns a dict of bytes keys and bytes values, decode them
            contents_bytes = redis_client.hgetall(queue_to_list)
            contents = {k.decode('utf-8'): v.decode('utf-8') for k, v in contents_bytes.items()}
            logger.info(f"--- Contents of Redis Hash '{queue_to_list}' ({len(contents)} fields) ---")
            item_count = 0
            for key, value in contents.items():  # key and value are now strings
                if item_count >= max_items:
                    logger.info(f" ... (stopped listing after {max_items} items of {hash_size})")
                    break
                # Attempt to pretty-print if value is JSON
                try:
                    parsed_value = json.loads(value)
                    pretty_value = json.dumps(parsed_value, indent=2)
                    logger.info(f" '{key}':\n{pretty_value}")
                except json.JSONDecodeError:
                    logger.info(f" '{key}': {value}")  # Print as string if not JSON
                item_count += 1
            logger.info("--- End of Hash Contents ---")

        elif key_type == 'none':
            logger.info(f"Redis key '{queue_to_list}' does not exist.")
        else:
            logger.info(f"Redis key '{queue_to_list}' is of type '{key_type}'. Listing contents for this type is not implemented.")

    except Exception as e:
        logger.error(f"Failed to list contents of Redis key '{queue_to_list}': {e}", exc_info=True)
        raise AirflowException(f"Failed to list Redis key contents: {e}")


def check_status_callable(**context):
    """Checks the status (type and size) of all standard Redis queues for a given base name."""
    params = context['params']
    redis_conn_id = params['redis_conn_id']
    queue_name = params.get('queue_name_for_status', DEFAULT_QUEUE_NAME)
    queue_suffixes = ['_inbox', '_progress', '_result', '_fail']

    logger.info(f"--- Checking Status for Queues with Base Name: '{queue_name}' ---")

    try:
        redis_client = _get_redis_client(redis_conn_id)

        for suffix in queue_suffixes:
            queue_to_check = f"{queue_name}{suffix}"
            key_type = redis_client.type(queue_to_check).decode('utf-8')
            size = 0
            if key_type == 'list':
                size = redis_client.llen(queue_to_check)
            elif key_type == 'hash':
                size = redis_client.hlen(queue_to_check)

            if key_type != 'none':
                logger.info(f" - Queue '{queue_to_check}': Type='{key_type.upper()}', Size={size}")
            else:
                logger.info(f" - Queue '{queue_to_check}': Does not exist.")

        logger.info("--- End of Status Check ---")

    except Exception as e:
        logger.error(f"Failed to check queue status for base name '{queue_name}': {e}", exc_info=True)
        raise AirflowException(f"Failed to check queue status: {e}")
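
# For the default base name 'video_queue', the status check above inspects the keys
# video_queue_inbox, video_queue_progress, video_queue_result and video_queue_fail.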


def add_videos_to_queue_callable(**context):
    """
    Parses video inputs, normalizes them to URLs, and adds them to a Redis queue.
    """
    params = context["params"]
    video_inputs = params["video_inputs"]
    queue_name = params["queue_name"]
    redis_conn_id = params["redis_conn_id"]
    dry_run = params["dry_run"]

    if not video_inputs:
        logger.info("No video inputs provided. Nothing to do.")
        print("No video inputs provided. Nothing to do.")
        return

    raw_items = parse_video_inputs(video_inputs)
    if not raw_items:
        logger.info("Input string was empty or contained no items after parsing.")
        print("Input string was empty or contained no items after parsing.")
        return

    valid_urls = []
    for item in raw_items:
        url = normalize_to_url(item)
        if url and url not in valid_urls:
            valid_urls.append(url)
        elif not url:
            logger.warning(f"Skipping invalid input item: '{item}'")

    if not valid_urls:
        raise AirflowException("No valid YouTube URLs or IDs were found in the provided input.")

    logger.info(f"Found {len(valid_urls)} valid and unique URLs to add to the queue:")
    print(f"Found {len(valid_urls)} valid and unique URLs to add to the queue:")
    for url in valid_urls:
        logger.info(f" - {url}")
        print(f" - {url}")

    if dry_run:
        logger.info("Dry run is enabled. Skipping Redis operation.")
        print(f"\n[DRY RUN] Would have added {len(valid_urls)} URLs to the Redis list '{queue_name}_inbox'.")
        return

    # --- Add to Redis ---
    # Define the target key before the try block so the error handler can reference it.
    inbox_queue = f"{queue_name}_inbox"
    try:
        redis_client = _get_redis_client(redis_conn_id)

        # Use a pipeline for atomic and efficient addition
        with redis_client.pipeline() as pipe:
            for url in valid_urls:
                pipe.rpush(inbox_queue, url)
            pipe.execute()

        final_list_length = redis_client.llen(inbox_queue)

        success_message = (
            f"Successfully added {len(valid_urls)} URLs to Redis list '{inbox_queue}'. "
            f"The list now contains {final_list_length} items."
        )
        logger.info(success_message)
        print(f"\n{success_message}")

    except Exception as e:
        logger.error(f"Failed to add URLs to Redis queue '{inbox_queue}': {e}", exc_info=True)
        raise AirflowException(f"Failed to add URLs to Redis: {e}")


# --- DAG Definition ---
with DAG(
    dag_id="ytdlp_mgmt_queues",
    default_args={
        "owner": "airflow",
        "start_date": days_ago(1),
        "retries": 0,
    },
    schedule=None,
    catchup=False,
    tags=["ytdlp", "queue", "management", "redis", "manual"],
    doc_md="""
### YT-DLP Queue Management

This DAG provides a set of tools to manage Redis queues used by the YTDLP processing pipeline.
Select an `action` to perform when triggering the DAG.

**Actions:**
- `add_videos`: Add one or more YouTube videos to a queue.
- `clear_queue`: Dump and/or delete a specific Redis key.
- `list_contents`: View the contents of a Redis key (list or hash).
- `check_status`: Report the type and size of the standard queues (`_inbox`, `_progress`, `_result`, `_fail`) for a base queue name.
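
**Example trigger configuration** (illustrative values):

```json
{
  "action": "add_videos",
  "video_inputs": "dQw4w9WgXcQ, https://youtu.be/abc123DEF45",
  "queue_name": "video_queue",
  "dry_run": false
}
```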
    """,
    params={
        "action": Param(
            "add_videos",
            type="string",
            enum=["add_videos", "clear_queue", "list_contents", "check_status"],
            title="Action",
            description="The management action to perform.",
        ),
        # --- Params for 'add_videos' ---
        "video_inputs": Param(
            None,
            type=["null", "string"],
            title="[add_videos] Video URLs or IDs",
            description="A single item, comma-separated list, or JSON array of YouTube URLs or Video IDs.",
        ),
        "queue_name": Param(
            DEFAULT_QUEUE_NAME,
            type="string",
            title="[add_videos] Queue Name",
            description="The base name of the Redis queue to add videos to (e.g., 'video_queue').",
        ),
        "dry_run": Param(
            False,
            type="boolean",
            title="[add_videos] Dry Run",
            description="If True, validate inputs without adding them to the queue.",
        ),
        # --- Params for 'clear_queue' ---
        "queue_to_clear": Param(
            DEFAULT_QUEUE_TO_CLEAR,
            type="string",
            title="[clear_queue] Queue to Clear",
            description="Exact name of the Redis key to delete.",
        ),
        "dump_queues": Param(
            True,
            type="boolean",
            title="[clear_queue] Dump Data",
            description="If True, dump data before clearing.",
        ),
        "dump_dir": Param(
            "{{ var.value.get('YTDLP_REDIS_DUMP_DIR', '/opt/airflow/dumps') }}",
            type="string",
            title="[clear_queue] Dump Directory",
            description="Base directory to save CSV dump files.",
        ),
        "dump_patterns": Param(
            "ytdlp:*,video_queue_*",
            type="string",
            title="[clear_queue] Dump Patterns",
            description="Comma-separated list of key patterns to dump.",
        ),
        # --- Params for 'list_contents' ---
        "queue_to_list": Param(
            "video_queue_inbox",
            type="string",
            title="[list_contents] Queue to List",
            description="Exact name of the Redis key to list.",
        ),
        "max_items": Param(
            10,
            type="integer",
            title="[list_contents] Max Items to List",
            description="Maximum number of items to show.",
        ),
        # --- Params for 'check_status' ---
        "queue_name_for_status": Param(
            DEFAULT_QUEUE_NAME,
            type="string",
            title="[check_status] Base Queue Name",
            description="Base name of the queues to check (e.g., 'video_queue').",
        ),
        # --- Common Params ---
        "redis_conn_id": Param(
            DEFAULT_REDIS_CONN_ID,
            type="string",
            title="Redis Connection ID",
        ),
    },
) as dag:
    branch_on_action = BranchPythonOperator(
        task_id="branch_on_action",
        python_callable=lambda **context: f"action_{context['params']['action']}",
    )

    action_add_videos = PythonOperator(
        task_id="action_add_videos",
        python_callable=add_videos_to_queue_callable,
    )

    action_clear_queue = PythonOperator(
        task_id="action_clear_queue",
        python_callable=clear_queue_callable,
        templates_dict={'dump_dir': "{{ params.dump_dir }}"},
    )

    action_list_contents = PythonOperator(
        task_id="action_list_contents",
        python_callable=list_contents_callable,
    )

    action_check_status = PythonOperator(
        task_id="action_check_status",
        python_callable=check_status_callable,
    )

    # --- Task Dependencies ---
    branch_on_action >> [action_add_videos, action_clear_queue, action_list_contents, action_check_status]