# -*- coding: utf-8 -*- """ Airflow DAG for manually adding YouTube URLs or Video IDs to a Redis queue. """ from __future__ import annotations import json import logging import re from typing import List, Optional import csv import os from datetime import datetime from airflow.exceptions import AirflowException from airflow.models.dag import DAG from airflow.models.param import Param from airflow.operators.python import PythonOperator, BranchPythonOperator from airflow.operators.empty import EmptyOperator from airflow.providers.redis.hooks.redis import RedisHook from airflow.utils.dates import days_ago # Configure logging logger = logging.getLogger(__name__) # Default settings DEFAULT_REDIS_CONN_ID = "redis_default" DEFAULT_QUEUE_NAME = "video_queue" DEFAULT_QUEUE_TO_CLEAR = 'PLEASE_SPECIFY_QUEUE_TO_CLEAR' # --- Helper Functions --- def _get_redis_client(redis_conn_id: str): """Gets a Redis client from an Airflow connection.""" try: redis_hook = RedisHook(redis_conn_id=redis_conn_id) return redis_hook.get_conn() except Exception as e: logger.error(f"Failed to connect to Redis using connection '{redis_conn_id}': {e}") raise AirflowException(f"Redis connection failed: {e}") def parse_video_inputs(input_str: str) -> List[str]: """Parses a flexible string of video inputs into a list of individual items.""" if not input_str or not isinstance(input_str, str): return [] input_str = input_str.strip() # 1. Try to parse as a JSON array if input_str.startswith("[") and input_str.endswith("]"): try: items = json.loads(input_str) if isinstance(items, list): logger.info("Successfully parsed input as a JSON array.") return [str(item).strip() for item in items] except json.JSONDecodeError: logger.warning("Input looked like a JSON array but failed to parse. Treating as a comma-separated string.") # 2. Treat as a comma-separated string items = [item.strip() for item in input_str.split(",")] # 3. Clean up quotes and extra whitespace from each item cleaned_items = [] for item in items: if item.startswith(('"', "'")) and item.endswith(('"', "'")): item = item[1:-1] if item: # Only add non-empty items cleaned_items.append(item.strip()) return cleaned_items def normalize_to_url(item: str) -> Optional[str]: """ Validates if an item is a recognizable YouTube URL or video ID, and normalizes it to a standard watch URL format. """ if not item: return None # Regex for a standard 11-character YouTube video ID video_id_pattern = r"^[a-zA-Z0-9_-]{11}$" # Check if the item itself is a video ID if re.match(video_id_pattern, item): video_id = item return f"https://www.youtube.com/watch?v={video_id}" # Comprehensive regex to extract video ID from various URL formats # Covers: watch, youtu.be, shorts, embed, /v/ url_patterns = [ r"(?:v=|\/v\/|youtu\.be\/|embed\/|shorts\/)([a-zA-Z0-9_-]{11})" ] for pattern in url_patterns: match = re.search(pattern, item) if match: video_id = match.group(1) return f"https://www.youtube.com/watch?v={video_id}" logger.warning(f"Could not recognize '{item}' as a valid YouTube URL or video ID.") return None def dump_redis_data_to_csv(redis_client, dump_dir, patterns): """Dumps data from Redis keys matching patterns to separate CSV files in a timestamped directory.""" timestamp_dir = datetime.now().strftime('%Y%m%d_%H%M%S') full_dump_path = os.path.join(dump_dir, timestamp_dir) os.makedirs(full_dump_path, exist_ok=True) logger.info(f"Created dump directory: {full_dump_path}") for pattern in patterns: if not pattern: continue # Sanitize pattern for filename sanitized_pattern = re.sub(r'[^a-zA-Z0-9_-]', '_', pattern) timestamp_file = datetime.now().strftime('%Y%m%d') dump_file_name = f'redis_dump_{sanitized_pattern}_{timestamp_file}.csv' dump_file_path = os.path.join(full_dump_path, dump_file_name) logger.info(f"Dumping keys matching '{pattern}' to {dump_file_path}") try: with open(dump_file_path, 'w', newline='', encoding='utf-8') as csvfile: writer = csv.writer(csvfile) writer.writerow(['key', 'type', 'field_or_index', 'value']) keys_found = 0 for key_bytes in redis_client.scan_iter(pattern): key = key_bytes.decode('utf-8') keys_found += 1 key_type = redis_client.type(key).decode('utf-8') if key_type == 'hash': for field, value in redis_client.hgetall(key).items(): writer.writerow([key, key_type, field.decode('utf-8'), value.decode('utf-8')]) elif key_type == 'list': for index, value in enumerate(redis_client.lrange(key, 0, -1)): writer.writerow([key, key_type, index, value.decode('utf-8')]) elif key_type == 'set': for member in redis_client.smembers(key): writer.writerow([key, key_type, None, member.decode('utf-8')]) elif key_type == 'string': value = redis_client.get(key) if value: writer.writerow([key, key_type, None, value.decode('utf-8')]) if keys_found > 0: logger.info(f"Successfully dumped {keys_found} keys for pattern '{pattern}' to {dump_file_path}") else: logger.info(f"No keys found for pattern '{pattern}'. Empty CSV file created at {dump_file_path}") except Exception as e: logger.error(f"Failed to dump Redis data for pattern '{pattern}': {e}", exc_info=True) raise AirflowException(f"Failed to dump Redis data for pattern '{pattern}': {e}") def clear_queue_callable(**context): """Dumps Redis data to CSV and/or clears a specified Redis key.""" params = context['params'] redis_conn_id = params['redis_conn_id'] queue_to_clear = params['queue_to_clear'] dump_queues = params['dump_queues'] # Get the rendered dump_dir from the templates_dict passed to the operator dump_dir = context['templates_dict']['dump_dir'] dump_patterns = params['dump_patterns'].split(',') if params.get('dump_patterns') else [] redis_client = _get_redis_client(redis_conn_id) if dump_queues and dump_patterns: dump_redis_data_to_csv(redis_client, dump_dir, dump_patterns) if not queue_to_clear or queue_to_clear == DEFAULT_QUEUE_TO_CLEAR: logger.info("Parameter 'queue_to_clear' is not specified or is the default placeholder. Skipping key deletion.") # If we only wanted to dump, this is a success. return logger.info(f"Attempting to clear Redis key '{queue_to_clear}' using connection '{redis_conn_id}'.") try: deleted_count = redis_client.delete(queue_to_clear) if deleted_count > 0: logger.info(f"Successfully cleared Redis key '{queue_to_clear}'.") else: logger.info(f"Redis key '{queue_to_clear}' did not exist or was already empty.") except Exception as e: logger.error(f"Failed to clear Redis key '{queue_to_clear}': {e}", exc_info=True) raise AirflowException(f"Failed to clear Redis key: {e}") def list_contents_callable(**context): """Lists the contents of the specified Redis key (list or hash).""" params = context['params'] redis_conn_id = params['redis_conn_id'] queue_to_list = params['queue_to_list'] max_items = params.get('max_items', 10) if not queue_to_list: raise ValueError("Parameter 'queue_to_list' cannot be empty.") logger.info(f"Attempting to list contents of Redis key '{queue_to_list}' (max: {max_items}) using connection '{redis_conn_id}'.") try: redis_client = _get_redis_client(redis_conn_id) key_type_bytes = redis_client.type(queue_to_list) key_type = key_type_bytes.decode('utf-8') # Decode type if key_type == 'list': list_length = redis_client.llen(queue_to_list) # Get the last N items, which are the most recently added with rpush items_to_fetch = min(max_items, list_length) # lrange with negative indices gets items from the end of the list. # -N to -1 gets the last N items. contents_bytes = redis_client.lrange(queue_to_list, -items_to_fetch, -1) contents = [item.decode('utf-8') for item in contents_bytes] # Reverse the list so the absolute most recent item is printed first contents.reverse() logger.info(f"--- Contents of Redis List '{queue_to_list}' (showing most recent {len(contents)} of {list_length}) ---") for i, item in enumerate(contents): # The index here is just for display, 0 is the most recent logger.info(f" [recent_{i}]: {item}") if list_length > len(contents): logger.info(f" ... ({list_length - len(contents)} older items not shown)") logger.info(f"--- End of List Contents ---") elif key_type == 'hash': hash_size = redis_client.hlen(queue_to_list) # HGETALL can be risky for large hashes. Consider HSCAN for production. # For manual inspection, HGETALL is often acceptable. if hash_size > max_items * 2: # Heuristic: avoid huge HGETALL logger.warning(f"Hash '{queue_to_list}' has {hash_size} fields, which is large. Listing might be slow or incomplete. Consider using redis-cli HSCAN.") # hgetall returns dict of bytes keys and bytes values, decode them contents_bytes = redis_client.hgetall(queue_to_list) contents = {k.decode('utf-8'): v.decode('utf-8') for k, v in contents_bytes.items()} logger.info(f"--- Contents of Redis Hash '{queue_to_list}' ({len(contents)} fields) ---") item_count = 0 for key, value in contents.items(): # key and value are now strings if item_count >= max_items: logger.info(f" ... (stopped listing after {max_items} items of {hash_size})") break # Attempt to pretty-print if value is JSON try: parsed_value = json.loads(value) pretty_value = json.dumps(parsed_value, indent=2) logger.info(f" '{key}':\n{pretty_value}") except json.JSONDecodeError: logger.info(f" '{key}': {value}") # Print as string if not JSON item_count += 1 logger.info(f"--- End of Hash Contents ---") elif key_type == 'none': logger.info(f"Redis key '{queue_to_list}' does not exist.") else: logger.info(f"Redis key '{queue_to_list}' is of type '{key_type}'. Listing contents for this type is not implemented.") except Exception as e: logger.error(f"Failed to list contents of Redis key '{queue_to_list}': {e}", exc_info=True) raise AirflowException(f"Failed to list Redis key contents: {e}") def check_status_callable(**context): """Checks the status (type and size) of all standard Redis queues for a given base name.""" params = context['params'] redis_conn_id = params['redis_conn_id'] queue_name = params.get('queue_name_for_status', DEFAULT_QUEUE_NAME) queue_suffixes = ['_inbox', '_progress', '_result', '_fail'] logger.info(f"--- Checking Status for Queues with Base Name: '{queue_name}' ---") try: redis_client = _get_redis_client(redis_conn_id) for suffix in queue_suffixes: queue_to_check = f"{queue_name}{suffix}" key_type = redis_client.type(queue_to_check).decode('utf-8') size = 0 if key_type == 'list': size = redis_client.llen(queue_to_check) elif key_type == 'hash': size = redis_client.hlen(queue_to_check) if key_type != 'none': logger.info(f" - Queue '{queue_to_check}': Type='{key_type.upper()}', Size={size}") else: logger.info(f" - Queue '{queue_to_check}': Does not exist.") logger.info(f"--- End of Status Check ---") except Exception as e: logger.error(f"Failed to check queue status for base name '{queue_name}': {e}", exc_info=True) raise AirflowException(f"Failed to check queue status: {e}") def add_videos_to_queue_callable(**context): """ Parses video inputs, normalizes them to URLs, and adds them to a Redis queue. """ params = context["params"] video_inputs = params["video_inputs"] queue_name = params["queue_name"] redis_conn_id = params["redis_conn_id"] dry_run = params["dry_run"] if not video_inputs: logger.info("No video inputs provided. Nothing to do.") print("No video inputs provided. Nothing to do.") return raw_items = parse_video_inputs(video_inputs) if not raw_items: logger.info("Input string was empty or contained no items after parsing.") print("Input string was empty or contained no items after parsing.") return valid_urls = [] for item in raw_items: url = normalize_to_url(item) if url and url not in valid_urls: valid_urls.append(url) elif not url: logger.warning(f"Skipping invalid input item: '{item}'") if not valid_urls: raise AirflowException("No valid YouTube URLs or IDs were found in the provided input.") logger.info(f"Found {len(valid_urls)} valid and unique URLs to add to the queue:") print(f"Found {len(valid_urls)} valid and unique URLs to add to the queue:") for url in valid_urls: logger.info(f" - {url}") print(f" - {url}") if dry_run: logger.info("Dry run is enabled. Skipping Redis operation.") print(f"\n[DRY RUN] Would have added {len(valid_urls)} URLs to the Redis list '{queue_name}_inbox'.") return # --- Add to Redis --- try: redis_client = _get_redis_client(redis_conn_id) inbox_queue = f"{queue_name}_inbox" # Use a pipeline for atomic and efficient addition with redis_client.pipeline() as pipe: for url in valid_urls: pipe.rpush(inbox_queue, url) pipe.execute() final_list_length = redis_client.llen(inbox_queue) success_message = ( f"Successfully added {len(valid_urls)} URLs to Redis list '{inbox_queue}'. " f"The list now contains {final_list_length} items." ) logger.info(success_message) print(f"\n{success_message}") except Exception as e: logger.error(f"Failed to add URLs to Redis queue '{inbox_queue}': {e}", exc_info=True) raise AirflowException(f"Failed to add URLs to Redis: {e}") # --- DAG Definition --- with DAG( dag_id="ytdlp_mgmt_queues", default_args={ "owner": "airflow", "start_date": days_ago(1), "retries": 0, }, schedule=None, catchup=False, tags=["ytdlp", "queue", "management", "redis", "manual"], doc_md=""" ### YT-DLP Queue Management This DAG provides a set of tools to manage Redis queues used by the YTDLP processing pipeline. Select an `action` to perform when triggering the DAG. **Actions:** - `add_videos`: Add one or more YouTube videos to a queue. - `clear_queue`: Dump and/or delete a specific Redis key. - `list_contents`: View the contents of a Redis key (list or hash). - `check_status`: (Placeholder) Check the overall status of the queues. """, params={ "action": Param( "add_videos", type="string", enum=["add_videos", "clear_queue", "list_contents", "check_status"], title="Action", description="The management action to perform.", ), # --- Params for 'add_videos' --- "video_inputs": Param( None, type=["null", "string"], title="[add_videos] Video URLs or IDs", description="A single item, comma-separated list, or JSON array of YouTube URLs or Video IDs.", ), "queue_name": Param( DEFAULT_QUEUE_NAME, type="string", title="[add_videos] Queue Name", description="The base name of the Redis queue to add videos to (e.g., 'video_queue').", ), "dry_run": Param( False, type="boolean", title="[add_videos] Dry Run", description="If True, validate inputs without adding them to the queue.", ), # --- Params for 'clear_queue' --- "queue_to_clear": Param( DEFAULT_QUEUE_TO_CLEAR, type="string", title="[clear_queue] Queue to Clear", description="Exact name of the Redis key to delete.", ), "dump_queues": Param( True, type="boolean", title="[clear_queue] Dump Data", description="If True, dump data before clearing.", ), "dump_dir": Param( "{{ var.value.get('YTDLP_REDIS_DUMP_DIR', '/opt/airflow/dumps') }}", type="string", title="[clear_queue] Dump Directory", description="Base directory to save CSV dump files.", ), "dump_patterns": Param( 'ytdlp:*,video_queue_*', type="string", title="[clear_queue] Dump Patterns", description="Comma-separated list of key patterns to dump.", ), # --- Params for 'list_contents' --- "queue_to_list": Param( 'video_queue_inbox', type="string", title="[list_contents] Queue to List", description="Exact name of the Redis key to list.", ), "max_items": Param( 10, type="integer", title="[list_contents] Max Items to List", description="Maximum number of items to show.", ), # --- Params for 'check_status' --- "queue_name_for_status": Param( DEFAULT_QUEUE_NAME, type="string", title="[check_status] Base Queue Name", description="Base name of the queues to check (e.g., 'video_queue').", ), # --- Common Params --- "redis_conn_id": Param( DEFAULT_REDIS_CONN_ID, type="string", title="Redis Connection ID", ), }, ) as dag: branch_on_action = BranchPythonOperator( task_id="branch_on_action", python_callable=lambda **context: f"action_{context['params']['action']}", ) action_add_videos = PythonOperator( task_id="action_add_videos", python_callable=add_videos_to_queue_callable, ) action_clear_queue = PythonOperator( task_id="action_clear_queue", python_callable=clear_queue_callable, templates_dict={'dump_dir': "{{ params.dump_dir }}"}, ) action_list_contents = PythonOperator( task_id="action_list_contents", python_callable=list_contents_callable, ) action_check_status = PythonOperator( task_id="action_check_status", python_callable=check_status_callable, ) # --- Placeholder Tasks --- branch_on_action >> [action_add_videos, action_clear_queue, action_list_contents, action_check_status]