Migrate to new DAGs and add proxy banning, sensor / worker in DAGs

parent fc2d740b65
commit 61906a57ef
@@ -1,24 +1,64 @@
-# Use a base Python image
-FROM python:3.11-slim
+# Use ubuntu:22.04 as the base image
+FROM ubuntu:22.04
 
 # Set working directory
 WORKDIR /app
 
-# Install necessary system packages for Playwright, GeoIP, and Xvfb
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    libgeoip1 \
-    # Xvfb for headless browser display
-    xvfb \
-    # Playwright browser dependencies
-    libnss3 libnspr4 libdbus-1-3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install Python dependencies: camoufox with geoip support and playwright==1.49
-# Using --no-cache-dir to reduce image size
-RUN pip install --no-cache-dir "camoufox[geoip]" playwright==1.49
+# Set timezone and non-interactive frontend for apt
+ARG DEBIAN_FRONTEND=noninteractive
+ARG TZ=Europe/Minsk
+ENV TZ=${TZ} LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+# Install necessary system packages for Playwright, GeoIP, Xvfb, and VNC
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    # From user example
+    vim lsof unzip wget ca-certificates \
+    # From existing Dockerfile, kept for completeness
+    libgeoip1 \
+    dbus-x11 \
+    xvfb \
+    xserver-common \
+    xauth \
+    x11-xkb-utils \
+    xfonts-base \
+    procps \
+    libgl1-mesa-dri \
+    x11vnc \
+    fluxbox \
+    libnss3 libnspr4 libdbus-1-3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \
+    libgtk-3-0 libx11-xcb1 fonts-liberation tzdata \
+    xauth util-linux x11-xserver-utils \
+    && \
+    # Configure timezone
+    ln -fs /usr/share/zoneinfo/${TZ} /etc/localtime && \
+    dpkg-reconfigure -f noninteractive tzdata && \
+    rm -rf /var/lib/apt/lists/*
+
+# Add build-time argument for VNC password
+ARG VNC_PASSWORD="vncpassword"
+
+# Set up VNC password from build argument
+RUN mkdir -p /root/.vnc && \
+    x11vnc -storepasswd "${VNC_PASSWORD}" /root/.vnc/passwd
+
+# Install Miniconda
+RUN wget --no-check-certificate https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && \
+    bash /tmp/miniconda.sh -b -p /opt/conda && \
+    rm /tmp/miniconda.sh
+
+ENV PATH="/opt/conda/bin:$PATH"
+
+# Create conda environment and configure it
+RUN conda init bash && \
+    conda config --set always_yes yes && \
+    conda tos accept --override-channels --channel defaults && \
+    conda create -n camo python=3.11 -y
+
+# Install Python dependencies in conda environment
+RUN conda run -n camo pip install --no-cache-dir "camoufox[geoip]" playwright==1.49
 
 # Install Playwright browsers for version 1.49
-RUN playwright install --with-deps
+RUN conda run -n camo playwright install --with-deps
 
 # Copy the server script into the image
 COPY camoufox_server.py .
@@ -32,11 +72,38 @@ COPY youtube_ad_auto_skipper-0.6.0.xpi /app/extensions/
 # Expose the default port Camoufox might use (adjust if needed)
 # This is informational; the actual port mapping is in docker-compose.
 EXPOSE 12345
+# Expose VNC port
+EXPOSE 5900
 
 # Copy the wrapper script and make it executable
 COPY start_camoufox.sh /app/
-RUN chmod +x /app/start_camoufox.sh
+RUN chmod +x /app/start_camoufox.sh && \
+    sed -i 's/\r$//' /app/start_camoufox.sh
 
-# Default command executes the wrapper script.
-# Arguments for camoufox_server.py will be passed via docker-compose command section.
-ENTRYPOINT ["/app/start_camoufox.sh"]
+# Configure Xvfb resolution via build arguments
+ARG RESOLUTION="1920x1080x24"
+ENV XVFB_RES="${RESOLUTION}" \
+    DISPLAY=":99" \
+    XAUTHORITY="/tmp/.Xauth"
+
+# Create Xauth setup (mcookie installed in previous apt-get)
+RUN touch /tmp/.Xauth && \
+    chmod 644 /tmp/.Xauth && \
+    echo "#!/bin/bash" > /init_x11.sh && \
+    echo "xauth add \$DISPLAY . \$(mcookie)" >> /init_x11.sh && \
+    echo "xhost +local:" >> /init_x11.sh && \
+    chmod +x /init_x11.sh
+
+# Proper ENTRYPOINT using shell form
+#ENTRYPOINT ["/bin/bash", "-c", "source /init_x11.sh && exec xvfb-run --auto-servernum --server-args \"-screen 0 ${XVFB_RES} ${XVFB_ARGS}\" /app/start_camoufox.sh"]
+
+ENTRYPOINT ["/bin/bash", "-c", "\
+    rm -f /tmp/.X99-lock && \
+    Xvfb :99 -screen 0 ${XVFB_RES} -ac & \
+    export DISPLAY=:99 && \
+    sleep 1 && \
+    touch /tmp/.Xauth && \
+    xauth add :99 . $(mcookie) && \
+    xhost +local: && \
+    source /init_x11.sh && \
+    exec /app/start_camoufox.sh \"$@\"", "camoufox-entrypoint"]
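The image is configured entirely through build arguments (TZ, VNC_PASSWORD, RESOLUTION), and the EXPOSE lines above are informational only. A minimal build-and-run sketch, assuming the Dockerfile sits at the build context root and using placeholder image/container names (the real port mapping and camoufox_server.py arguments come from docker-compose):

    # Build with a custom VNC password and Xvfb resolution (both ARGs above)
    docker build -t camoufox-server \
        --build-arg VNC_PASSWORD=changeme \
        --build-arg RESOLUTION=1920x1080x24 .

    # Run, publishing the Camoufox port and the VNC port declared by EXPOSE.
    # Anything appended after the image name is forwarded to camoufox_server.py
    # by the entrypoint and the wrapper script.
    docker run -d --name camoufox -p 12345:12345 -p 5900:5900 camoufox-server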
@@ -1,58 +1,39 @@
 #!/bin/bash
 
-# Set error handling
 set -e
 
-# Function to cleanup resources on exit
+# Global PIDs for cleanup
+VNC_PID=""
+FLUXBOX_PID=""
+
+# Cleanup function to terminate background processes on script exit
 cleanup() {
-    echo "Cleaning up resources..."
-    # Kill Xvfb if it's running
-    if [ -n "$XVFB_PID" ] && ps -p $XVFB_PID > /dev/null; then
-        echo "Stopping Xvfb (PID: $XVFB_PID)"
-        kill $XVFB_PID || true
-    fi
-
-    # Remove X lock files if they exist
-    if [ -e "/tmp/.X99-lock" ]; then
-        echo "Removing X lock file"
-        rm -f /tmp/.X99-lock
-    fi
-
-    echo "Cleanup complete"
+    echo "Cleaning up background processes..."
+    # Kill processes in reverse order of startup. The '|| true' prevents errors if a process is already dead.
+    if [ -n "$FLUXBOX_PID" ]; then kill -TERM $FLUXBOX_PID 2>/dev/null || true; fi
+    if [ -n "$VNC_PID" ]; then kill -TERM $VNC_PID 2>/dev/null || true; fi
+    echo "Cleanup complete."
 }
 
-# Register the cleanup function to run on script exit
 trap cleanup EXIT
 
-# Check if X lock file exists and remove it (in case of previous unclean shutdown)
-if [ -e "/tmp/.X99-lock" ]; then
-    echo "Removing existing X lock file"
-    rm -f /tmp/.X99-lock
-fi
-
-# Start Xvfb with display :99
-echo "Starting Xvfb on display :99"
-Xvfb :99 -screen 0 1280x1024x24 -ac &
-XVFB_PID=$!
-
-# Wait a moment for Xvfb to initialize
-sleep 2
-
-# Check if Xvfb started successfully
-if ! ps -p $XVFB_PID > /dev/null; then
-    echo "Failed to start Xvfb"
-    exit 1
-fi
-
-# Export the DISPLAY variable for the browser
-export DISPLAY=:99
-
-echo "Xvfb started successfully with PID: $XVFB_PID"
-echo "DISPLAY set to: $DISPLAY"
-
-# Start the Camoufox server with all arguments passed to this script
-echo "Starting Camoufox server with arguments:"
-printf "  Arg: '%s'\n" "$@" # Print each argument quoted on a new line
-echo "Executing: python3 camoufox_server.py $@"
-python3 camoufox_server.py "$@"
+# Xvfb is now started by xvfb-run in the Dockerfile ENTRYPOINT.
+# The DISPLAY variable will be set automatically by xvfb-run.
+
+# It's safer to source conda.sh directly
+source /opt/conda/etc/profile.d/conda.sh
+conda activate camo
+
+# Start supporting services (VNC, window manager)
+echo "Starting VNC server on port 5900..."
+# The -noxdamage flag is added to improve compatibility with VNC clients like the one on macOS.
+# The '-localhost no' part was likely a typo and has been removed as the default is to allow non-localhost connections.
+x11vnc -forever -usepw -display $DISPLAY -rfbport 5900 -o /var/log/x11vnc.log -shared -noxdamage &
+VNC_PID=$!
+
+echo "Starting Fluxbox window manager..."
+fluxbox > /var/log/fluxbox.log 2>&1 &
+FLUXBOX_PID=$!
+
+# Start main application
+echo "Starting Camoufox server with arguments: $@"
+exec python3 camoufox_server.py "$@"
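The wrapper script above leaves x11vnc and fluxbox running in the background and writes their logs under /var/log inside the container. A quick sanity check, assuming the container is named camoufox as in the earlier sketch:

    # Confirm the VNC server and window manager are up (procps is installed in the image)
    docker exec camoufox pgrep -a x11vnc
    docker exec camoufox pgrep -a fluxbox

    # Log paths come from the script itself
    docker exec camoufox tail -n 20 /var/log/x11vnc.log /var/log/fluxbox.log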
46  dags/README.ru.md  Normal file
@@ -0,0 +1,46 @@
# Architecture and overview of the YTDLP Airflow DAGs

This document describes the architecture and purpose of the DAGs used to download videos from YouTube. The system follows a "Sensor/Worker" pattern to provide continuous, parallel processing.

## Main processing loop

### `ytdlp_sensor_redis_queue` (Sensor)

- **Purpose:** Pulls URLs to download from a Redis queue and launches workers to process them.
- **How it works (trigger-driven):**
  - **On trigger:** When a `ytdlp_worker_per_url` worker finishes successfully, it immediately triggers the sensor, which keeps processing continuous and free of delays. Scheduled runs are disabled to avoid re-launching tasks for banned accounts.
  - **Logic:** Pops a batch of URLs from Redis (the `_inbox` list). If the queue is empty, the DAG finishes successfully and waits for the next trigger.

### `ytdlp_worker_per_url` (Worker)

- **Purpose:** Processes a single URL, downloads the video, and keeps the cycle going.
- **How it works:**
  - Receives one URL from the sensor.
  - Calls the `ytdlp-ops-auth` service to obtain `info.json` and a `socks5` proxy.
  - Downloads the video using the data it received. (TODO: replace invoking `yt-dlp` as a command with a library call.)
  - Depending on the outcome (success/failure), writes the result to the corresponding Redis hash (`_result` or `_fail`).
  - On success, re-triggers the `ytdlp_sensor_redis_queue` sensor to continue the processing cycle; on failure, the cycle stops for manual diagnosis. A sketch of this Redis handoff follows below.
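A minimal sketch of the queue handoff described above, assuming redis-py and key names modelled on the suffixes used in this document (`_inbox` list, `_result`/`_fail` hashes); the exact key prefixes and batch size in the real DAGs may differ:

```python
import json
import redis

def pull_inbox_batch(r: redis.Redis, queue: str, batch_size: int = 10) -> list:
    """Sensor side: pop up to batch_size URLs from the <queue>_inbox list."""
    urls = []
    for _ in range(batch_size):
        url = r.lpop(f"{queue}_inbox")
        if url is None:  # queue drained: the sensor run can finish successfully
            break
        urls.append(url)
    return urls

def report_result(r: redis.Redis, queue: str, url: str, ok: bool, payload: dict) -> None:
    """Worker side: record the outcome in the matching hash."""
    target = f"{queue}_result" if ok else f"{queue}_fail"
    r.hset(target, url, json.dumps(payload))
```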
## Management DAGs

These DAGs are intended for manual queue management and do not take part in the automatic cycle.

- **`ytdlp_mgmt_queue_add_and_verify`**: Adds URLs to the task queue (`_inbox`) and then verifies the status of that queue.
- **`ytdlp_mgmt_queues_check_status`**: Shows the state and contents of all key queues (`_inbox`, `_progress`, `_result`, `_fail`), which helps track processing. The redis-cli sketch below illustrates the kind of inspection these DAGs perform.
- **`ytdlp_mgmt_queue_clear`**: Clears (fully deletes) the specified Redis queue. **Use with caution**: the operation is irreversible.
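For reference, a few `redis-cli` commands that inspect the same structures, assuming a placeholder queue name `myqueue` (the real DAGs derive the key names themselves):

```bash
# Length and first entries of the task queue (a Redis list)
redis-cli LLEN myqueue_inbox
redis-cli LRANGE myqueue_inbox 0 9

# Completed and failed items (Redis hashes keyed by URL)
redis-cli HGETALL myqueue_result
redis-cli HGETALL myqueue_fail

# Delete a queue entirely (what ytdlp_mgmt_queue_clear does; irreversible)
redis-cli DEL myqueue_inbox
```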
## External services

### `ytdlp-ops-auth` (Thrift Service)

- **Purpose:** An external service that provides the authentication data (tokens, cookies, proxy) needed to download videos.
- **Interaction:** The worker DAG (`ytdlp_worker_per_url`) calls this service before starting a download to obtain the data `yt-dlp` needs.
## TODO (planned improvements)

- **Implement a "Circuit Breaker" mechanism:**
  - **Problem:** If a worker fails (for example, because an account is banned), a schedule-driven sensor keeps creating new tasks for that same account and makes the problem worse.
  - **Solution:**
    1. **Worker (`ytdlp_worker_per_url`):** On task failure, the worker should set a temporary ban flag in Redis for its `account_id` (for example, for 5-10 minutes).
    2. **Sensor (`ytdlp_sensor_redis_queue`):** Before checking the queue, the sensor should check whether a ban flag exists for its `account_id`. If the account is banned, the sensor should skip the run, preventing new workers from being launched for the problematic account.
  - **Result:** This prevents repeated requests against a banned account and gives the system time to recover. A sketch of the flag mechanism follows below.
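A minimal sketch of the proposed ban flag, assuming redis-py and a hypothetical `ytdlp:ban:<account_id>` key (the real key name and TTL are still to be decided):

```python
import redis

BAN_TTL_SECONDS = 600  # 5-10 minutes, as suggested above

def ban_account(r: redis.Redis, account_id: str) -> None:
    """Worker side: on failure, set a ban flag that expires on its own."""
    r.setex(f"ytdlp:ban:{account_id}", BAN_TTL_SECONDS, "banned")

def is_account_banned(r: redis.Redis, account_id: str) -> bool:
    """Sensor side: skip the run while the flag is still present."""
    return r.exists(f"ytdlp:ban:{account_id}") == 1
```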
@@ -1,941 +0,0 @@
from airflow import DAG
|
|
||||||
from airflow.models import BaseOperator, Variable
|
|
||||||
from airflow.utils.decorators import apply_defaults
|
|
||||||
from airflow.hooks.base import BaseHook
|
|
||||||
from airflow.exceptions import AirflowException
|
|
||||||
from airflow.utils.dates import days_ago
|
|
||||||
from thrift.transport import TSocket, TTransport
|
|
||||||
from thrift.protocol import TBinaryProtocol
|
|
||||||
from thrift.transport.TTransport import TTransportException
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
from pangramia.yt.exceptions.ttypes import PBServiceException
|
|
||||||
import redis
|
|
||||||
import logging
|
|
||||||
import time
|
|
||||||
import socket
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
from pangramia.yt.tokens_ops import YTTokenOpService
|
|
||||||
from pangramia.yt.common.ttypes import TokenUpdateMode
|
|
||||||
from airflow.providers.redis.hooks.redis import RedisHook
|
|
||||||
from airflow.operators.python import PythonOperator
|
|
||||||
from airflow.models.param import Param
|
|
||||||
# Assuming ytdlp_utils exists in the same directory or PYTHONPATH
|
|
||||||
# from ytdlp_utils import get_info_json, is_valid_json, extract_video_id
|
|
||||||
|
|
||||||
# Configure logging
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Default settings (similar to ytdlp_client_dag.py)
|
|
||||||
MAX_RETRIES = 1
|
|
||||||
RETRY_DELAY = timedelta(seconds=10)
|
|
||||||
DEFAULT_TIMEOUT = 30
|
|
||||||
|
|
||||||
class YtdlpOpsOperator(BaseOperator):
|
|
||||||
"""
|
|
||||||
Custom Airflow operator to interact with YTDLP Thrift service. Handles direct connections
|
|
||||||
and Redis-based discovery, retrieves tokens, saves info.json, and manages errors.
|
|
||||||
"""
|
|
||||||
template_fields = ('url', 'service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir')
|
|
||||||
|
|
||||||
@apply_defaults
|
|
||||||
def __init__(self, url, redis_conn_id='redis_default', max_retries=3, retry_delay=10,
|
|
||||||
service_ip=None, service_port=None, redis_enabled=False, account_id=None,
|
|
||||||
save_info_json=True, info_json_dir=None, get_socks_proxy=True,
|
|
||||||
store_socks_proxy=False, timeout=DEFAULT_TIMEOUT, *args, **kwargs):
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
|
|
||||||
logger.info(f"Initializing YtdlpOpsOperator with parameters: url={url}, "
|
|
||||||
f"redis_conn_id={redis_conn_id}, max_retries={max_retries}, retry_delay={retry_delay}, "
|
|
||||||
f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, "
|
|
||||||
f"account_id={account_id}, save_info_json={save_info_json}, info_json_dir={info_json_dir}, "
|
|
||||||
f"get_socks_proxy={get_socks_proxy}, store_socks_proxy={store_socks_proxy}, timeout={timeout}")
|
|
||||||
|
|
||||||
# Validate required parameters
|
|
||||||
if not url:
|
|
||||||
raise ValueError("url is required")
|
|
||||||
|
|
||||||
# Validate parameters based on connection mode
|
|
||||||
if redis_enabled:
|
|
||||||
if not account_id:
|
|
||||||
raise ValueError("account_id is required when redis_enabled=True")
|
|
||||||
# Use default Redis connection if not specified
|
|
||||||
if not redis_conn_id:
|
|
||||||
redis_conn_id = 'redis_default'
|
|
||||||
logger.info(f"Using default Redis connection ID: {redis_conn_id}")
|
|
||||||
else:
|
|
||||||
if not service_ip or not service_port:
|
|
||||||
raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False")
|
|
||||||
if not account_id:
|
|
||||||
logger.warning("No account_id provided for direct connection mode. Using 'default'")
|
|
||||||
account_id = 'default' # Assign default if missing in direct mode
|
|
||||||
|
|
||||||
self.url = url
|
|
||||||
self.redis_conn_id = redis_conn_id
|
|
||||||
self.max_retries = max_retries
|
|
||||||
self.retry_delay = int(retry_delay.total_seconds() if isinstance(retry_delay, timedelta) else retry_delay)
|
|
||||||
self.service_ip = service_ip
|
|
||||||
self.service_port = service_port
|
|
||||||
self.redis_enabled = redis_enabled
|
|
||||||
self.account_id = account_id
|
|
||||||
self.save_info_json = save_info_json
|
|
||||||
self.info_json_dir = info_json_dir
|
|
||||||
self.get_socks_proxy = get_socks_proxy
|
|
||||||
self.store_socks_proxy = store_socks_proxy
|
|
||||||
self.timeout = timeout
|
|
||||||
|
|
||||||
def execute(self, context):
|
|
||||||
logger.info("Executing YtdlpOpsOperator")
|
|
||||||
transport = None
|
|
||||||
try:
|
|
||||||
logger.info("Getting task parameters")
|
|
||||||
params = context.get('params', {})
|
|
||||||
redis_enabled = params.get('redis_enabled', self.redis_enabled)
|
|
||||||
logger.info(f"Using redis_enabled={redis_enabled} (from {'task params' if 'redis_enabled' in params else 'operator init'})")
|
|
||||||
|
|
||||||
# Determine account_id to use (from params or operator default)
|
|
||||||
account_id = context['params'].get('account_id', self.account_id)
|
|
||||||
logger.info(f"Using account_id='{account_id}' (from {'task params' if 'account_id' in params else 'operator init'})")
|
|
||||||
|
|
||||||
if redis_enabled:
|
|
||||||
# Get Redis connection with proper authentication and error handling
|
|
||||||
redis_conn = BaseHook.get_connection(self.redis_conn_id)
|
|
||||||
redis_client = redis.Redis(
|
|
||||||
host=redis_conn.host,
|
|
||||||
port=redis_conn.port,
|
|
||||||
password=redis_conn.password,
|
|
||||||
db=0,
|
|
||||||
decode_responses=True # Important for consistent key handling
|
|
||||||
)
|
|
||||||
|
|
||||||
# Test Redis connection
|
|
||||||
try:
|
|
||||||
if not redis_client.ping():
|
|
||||||
raise redis.exceptions.ConnectionError("Redis ping failed")
|
|
||||||
logger.info(f"Successfully connected to Redis at {redis_conn.host}:{redis_conn.port}")
|
|
||||||
except redis.exceptions.AuthenticationError:
|
|
||||||
logger.error(f"Redis authentication failed for connection '{self.redis_conn_id}'. Check password.")
|
|
||||||
raise AirflowException("Redis authentication failed.")
|
|
||||||
except redis.exceptions.ConnectionError as e:
|
|
||||||
logger.error(f"Could not connect to Redis at {redis_conn.host}:{redis_conn.port}. Error: {e}")
|
|
||||||
raise AirflowException(f"Redis connection failed: {e}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Unexpected Redis error: {str(e)}")
|
|
||||||
raise AirflowException(f"Unexpected Redis error: {e}")
|
|
||||||
|
|
||||||
# Get service details from Redis with retries and proper key handling
|
|
||||||
service_key = f"ytdlp:{account_id}"
|
|
||||||
legacy_key = account_id # For backward compatibility
|
|
||||||
|
|
||||||
host = None
|
|
||||||
port = None
|
|
||||||
for attempt in range(self.max_retries):
|
|
||||||
try:
|
|
||||||
logger.info(f"Attempt {attempt + 1}/{self.max_retries}: Fetching service details from Redis for keys: '{service_key}', '{legacy_key}'")
|
|
||||||
service_details = redis_client.hgetall(service_key)
|
|
||||||
if not service_details:
|
|
||||||
logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'")
|
|
||||||
service_details = redis_client.hgetall(legacy_key)
|
|
||||||
|
|
||||||
if not service_details:
|
|
||||||
raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}")
|
|
||||||
|
|
||||||
# Find IP and port, handling potential case differences and byte/string types
|
|
||||||
ip_key = next((k for k in service_details if k.lower() == 'ip'), None)
|
|
||||||
port_key = next((k for k in service_details if k.lower() == 'port'), None)
|
|
||||||
|
|
||||||
if not ip_key: raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}")
|
|
||||||
if not port_key: raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}")
|
|
||||||
|
|
||||||
host = service_details[ip_key] # Already decoded due to decode_responses=True
|
|
||||||
port_str = service_details[port_key]
|
|
||||||
|
|
||||||
try:
|
|
||||||
port = int(port_str)
|
|
||||||
except ValueError:
|
|
||||||
raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}")
|
|
||||||
|
|
||||||
logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}")
|
|
||||||
break # Success
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {str(e)}")
|
|
||||||
if attempt == self.max_retries - 1:
|
|
||||||
logger.error("Max retries reached for fetching Redis details.")
|
|
||||||
raise AirflowException(f"Failed to get service details from Redis after {self.max_retries} attempts: {e}")
|
|
||||||
logger.info(f"Retrying in {self.retry_delay} seconds...")
|
|
||||||
time.sleep(self.retry_delay)
|
|
||||||
else:
|
|
||||||
# Direct connection: Log parameter sources
|
|
||||||
params = context.get('params', {})
|
|
||||||
host = params.get('service_ip', self.service_ip)
|
|
||||||
host_source = 'task params' if 'service_ip' in params else 'operator init'
|
|
||||||
port_str = params.get('service_port', self.service_port)
|
|
||||||
port_source = 'task params' if 'service_port' in params else 'operator init'
|
|
||||||
url = params.get('url', self.url)
|
|
||||||
url_source = 'task params' if 'url' in params else 'operator init'
|
|
||||||
|
|
||||||
logger.info(f"Using service_ip={host} (from {host_source})")
|
|
||||||
logger.info(f"Using service_port={port_str} (from {port_source})")
|
|
||||||
logger.info(f"Using url={url} (from {url_source})")
|
|
||||||
|
|
||||||
if not host or not port_str:
|
|
||||||
raise ValueError("Direct connection requires service_ip and service_port")
|
|
||||||
try:
|
|
||||||
port = int(port_str)
|
|
||||||
except ValueError:
|
|
||||||
raise ValueError(f"Invalid service_port value: {port_str}")
|
|
||||||
|
|
||||||
logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)")
|
|
||||||
|
|
||||||
# Render and validate timeout
|
|
||||||
timeout_param = context.get('params', {}).get('timeout', self.timeout)
|
|
||||||
if isinstance(self.timeout, str) and '{{' in self.timeout:
|
|
||||||
timeout_rendered = self.render_template(self.timeout, context)
|
|
||||||
logger.info(f"Rendered timeout template: '{self.timeout}' -> '{timeout_rendered}'")
|
|
||||||
timeout_param = timeout_rendered
|
|
||||||
try:
|
|
||||||
timeout = int(timeout_param)
|
|
||||||
if timeout <= 0: raise ValueError("Timeout must be positive")
|
|
||||||
logger.info(f"Using timeout: {timeout} seconds")
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
logger.warning(f"Invalid timeout value: '{timeout_param}'. Using default: {DEFAULT_TIMEOUT}")
|
|
||||||
timeout = DEFAULT_TIMEOUT
|
|
||||||
|
|
||||||
# Create Thrift connection objects
|
|
||||||
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET) # Explicitly use AF_INET (IPv4)
|
|
||||||
socket_conn.setTimeout(timeout * 1000) # Thrift timeout is in milliseconds
|
|
||||||
transport = TTransport.TFramedTransport(socket_conn)
|
|
||||||
protocol = TBinaryProtocol.TBinaryProtocol(transport)
|
|
||||||
client = YTTokenOpService.Client(protocol)
|
|
||||||
|
|
||||||
logger.info(f"Attempting to connect to Thrift server at {host}:{port}...")
|
|
||||||
try:
|
|
||||||
transport.open()
|
|
||||||
logger.info("Successfully connected to Thrift server.")
|
|
||||||
|
|
||||||
# Test connection with ping
|
|
||||||
try:
|
|
||||||
client.ping()
|
|
||||||
logger.info("Server ping successful.")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Server ping failed: {e}")
|
|
||||||
raise AirflowException(f"Server connection test (ping) failed: {e}")
|
|
||||||
|
|
||||||
# Get token from service with specific error handling
|
|
||||||
try:
|
|
||||||
url_param = context.get('params', {}).get('url', self.url)
|
|
||||||
logger.info(f"Requesting token for accountId='{account_id}', url='{url_param}'")
|
|
||||||
token_data = client.getOrRefreshToken(
|
|
||||||
accountId=account_id,
|
|
||||||
updateType=TokenUpdateMode.AUTO,
|
|
||||||
url=url_param
|
|
||||||
)
|
|
||||||
logger.info("Successfully retrieved token data from service.")
|
|
||||||
except PBServiceException as e:
|
|
||||||
logger.error(f"PBServiceException occurred: Code={getattr(e, 'errorCode', 'N/A')}, Message={getattr(e, 'message', 'N/A')}")
|
|
||||||
error_code = getattr(e, 'errorCode', None)
|
|
||||||
error_msg = f"YTDLP service error: {getattr(e, 'message', str(e))}"
|
|
||||||
# Handle specific known error codes
|
|
||||||
if error_code in [
|
|
||||||
"SOCKS5_CONNECTION_FAILED", "SOCKS5_TIMEOUT",
|
|
||||||
"SOCKS5_CONNECTION_REFUSED", "SOCKS5_CONNECTION_TIMEOUT",
|
|
||||||
"SOCKS5_HOST_NOT_FOUND", "SOCKS5_NETWORK_UNREACHABLE"
|
|
||||||
]:
|
|
||||||
error_msg = f"SOCKS5 proxy error ({error_code}): {e.message}. Check proxy settings."
|
|
||||||
elif error_code == "BOT_DETECTION":
|
|
||||||
error_msg = f"Bot detection triggered ({error_code}): {e.message}."
|
|
||||||
suggestions = getattr(e, 'context', {}).get('suggestions', [])
|
|
||||||
if suggestions: error_msg += "\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions)
|
|
||||||
elif error_code == "NODEJS_SCRIPT_ERROR":
|
|
||||||
error_msg = f"Node.js script error ({error_code}): {e.message}."
|
|
||||||
elif error_code == "NODEJS_TIMEOUT":
|
|
||||||
error_msg = f"Node.js timeout ({error_code}): {e.message}."
|
|
||||||
# Add more specific error handling as needed
|
|
||||||
raise AirflowException(error_msg)
|
|
||||||
except TTransportException as e:
|
|
||||||
logger.error(f"Thrift transport error during getOrRefreshToken: {e}")
|
|
||||||
raise AirflowException(f"Transport error during API call: {e}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Unexpected error during getOrRefreshToken: {e}")
|
|
||||||
raise AirflowException(f"Unexpected error during API call: {e}")
|
|
||||||
|
|
||||||
except TTransportException as e:
|
|
||||||
# Handle connection-specific transport errors
|
|
||||||
if "read 0 bytes" in str(e) or "Could not connect to" in str(e) or "Connection refused" in str(e):
|
|
||||||
logger.error(f"Connection failed to {host}:{port}. Details: {e}")
|
|
||||||
logger.error("Possible causes: Server down, firewall block, incorrect IP/port.")
|
|
||||||
raise AirflowException(f"Failed to connect to YTDLP service at {host}:{port}: {e}")
|
|
||||||
else:
|
|
||||||
logger.error(f"Thrift transport error during connection: {str(e)}")
|
|
||||||
raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Unexpected error during connection or ping: {str(e)}")
|
|
||||||
raise # Re-raise other unexpected errors
|
|
||||||
|
|
||||||
# Log received token data attributes for debugging
|
|
||||||
logger.debug(f"Token data received. Attributes: {dir(token_data)}")
|
|
||||||
for attr in dir(token_data):
|
|
||||||
if not attr.startswith('__') and not callable(getattr(token_data, attr)): # Log non-callable attributes
|
|
||||||
value = getattr(token_data, attr)
|
|
||||||
if attr == 'infoJson' and value:
|
|
||||||
logger.debug(f"infoJson: {value[:50]}...")
|
|
||||||
else:
|
|
||||||
logger.debug(f"{attr}: {value}")
|
|
||||||
|
|
||||||
info_json_path = None # Initialize info_json_path
|
|
||||||
|
|
||||||
save_info_json_param = context['params'].get('save_info_json', self.save_info_json)
|
|
||||||
# Render if it's a string template
|
|
||||||
if isinstance(save_info_json_param, str):
|
|
||||||
save_info_json_rendered = self.render_template(save_info_json_param, context)
|
|
||||||
# Convert common string representations to boolean
|
|
||||||
save_info_json = str(save_info_json_rendered).lower() in ['true', '1', 't', 'y', 'yes']
|
|
||||||
else:
|
|
||||||
save_info_json = bool(save_info_json_param)
|
|
||||||
|
|
||||||
|
|
||||||
# Save info.json if requested and valid
|
|
||||||
if self.save_info_json:
|
|
||||||
info_json = self._get_info_json(token_data)
|
|
||||||
if info_json and self._is_valid_json(info_json):
|
|
||||||
try:
|
|
||||||
# Use internal _save_info_json method which handles rendering, dir creation, logging
|
|
||||||
info_json_path = self._save_info_json(context, info_json)
|
|
||||||
if info_json_path: # Check if saving was successful
|
|
||||||
context['task_instance'].xcom_push(key='info_json_path', value=info_json_path)
|
|
||||||
logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}")
|
|
||||||
else:
|
|
||||||
# _save_info_json should log errors, push None to indicate failure
|
|
||||||
context['task_instance'].xcom_push(key='info_json_path', value=None)
|
|
||||||
logger.warning("info.json saving failed (check logs from _save_info_json), pushing None to XCom for info_json_path.")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True)
|
|
||||||
context['task_instance'].xcom_push(key='info_json_path', value=None) # Push None on error
|
|
||||||
elif info_json:
|
|
||||||
logger.warning("Retrieved infoJson is not valid JSON. Skipping save.")
|
|
||||||
context['task_instance'].xcom_push(key='info_json_path', value=None)
|
|
||||||
else:
|
|
||||||
logger.info("No infoJson found in token data. Skipping save.")
|
|
||||||
context['task_instance'].xcom_push(key='info_json_path', value=None)
|
|
||||||
else:
|
|
||||||
logger.info("save_info_json is False. Skipping info.json save.")
|
|
||||||
context['task_instance'].xcom_push(key='info_json_path', value=None)
|
|
||||||
|
|
||||||
|
|
||||||
# Extract and potentially store SOCKS proxy
|
|
||||||
socks_proxy = None
|
|
||||||
if self.get_socks_proxy: # Use instance attribute
|
|
||||||
# Check for common attribute names for proxy
|
|
||||||
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
|
|
||||||
if proxy_attr:
|
|
||||||
socks_proxy = getattr(token_data, proxy_attr)
|
|
||||||
if socks_proxy: # Ensure proxy value is not empty
|
|
||||||
logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
|
|
||||||
if self.store_socks_proxy: # Use instance attribute
|
|
||||||
context['task_instance'].xcom_push(key='socks_proxy', value=socks_proxy)
|
|
||||||
logger.info(f"Pushed key 'socks_proxy' to XCom with value: {socks_proxy}")
|
|
||||||
else:
|
|
||||||
logger.info("SOCKS proxy extracted but not pushed to XCom (store_socks_proxy=False).")
|
|
||||||
else:
|
|
||||||
logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty. No proxy extracted.")
|
|
||||||
# Push None even if found but empty, if storing is enabled
|
|
||||||
if self.store_socks_proxy: # Use instance attribute
|
|
||||||
context['task_instance'].xcom_push(key='socks_proxy', value=None)
|
|
||||||
logger.info("Pushed None to XCom for 'socks_proxy' as extracted value was empty.")
|
|
||||||
else:
|
|
||||||
logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found in token data.")
|
|
||||||
# Push None if storing is enabled but attribute not found
|
|
||||||
if self.store_socks_proxy: # Use instance attribute
|
|
||||||
context['task_instance'].xcom_push(key='socks_proxy', value=None)
|
|
||||||
logger.info("Pushed None to XCom for 'socks_proxy' as attribute was not found.")
|
|
||||||
else:
|
|
||||||
logger.info("get_socks_proxy is False. Skipping proxy extraction.")
|
|
||||||
# Push None if storing is enabled but extraction was skipped
|
|
||||||
if self.store_socks_proxy: # Use instance attribute
|
|
||||||
context['task_instance'].xcom_push(key='socks_proxy', value=None)
|
|
||||||
logger.info("Pushed None to XCom for 'socks_proxy' as get_socks_proxy=False.")
|
|
||||||
|
|
||||||
|
|
||||||
# Get the original command from the server
|
|
||||||
ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
|
|
||||||
if not ytdlp_cmd:
|
|
||||||
logger.error("No 'ytdlpCommand' attribute found in token data.")
|
|
||||||
raise AirflowException("Required 'ytdlpCommand' not received from service.")
|
|
||||||
|
|
||||||
logger.info(f"Original command received from server: {ytdlp_cmd}")
|
|
||||||
|
|
||||||
# Log example usage command (DO NOT MODIFY the original command here)
|
|
||||||
if info_json_path:
|
|
||||||
# Use double quotes for paths/proxy in example for robustness
|
|
||||||
example_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
|
|
||||||
if socks_proxy:
|
|
||||||
example_cmd += f" --proxy \"{socks_proxy}\""
|
|
||||||
example_cmd += " --verbose --simulate" # Add useful flags for testing
|
|
||||||
logger.info(f"\n--- Example usage with saved info.json ---")
|
|
||||||
logger.info(example_cmd)
|
|
||||||
logger.info(f"(Note: The actual command with tokens/cookies is pushed to XCom as 'ytdlp_command')")
|
|
||||||
latest_json_path = os.path.join(os.path.dirname(info_json_path), 'latest.json')
|
|
||||||
logger.info(f"(You can also use 'latest.json': {latest_json_path})")
|
|
||||||
logger.info(f"-------------------------------------------\n")
|
|
||||||
|
|
||||||
else:
|
|
||||||
logger.info("\n--- Original command pushed to XCom ('ytdlp_command') ---")
|
|
||||||
if socks_proxy:
|
|
||||||
logger.info(f"Use the extracted proxy '{socks_proxy}' (pushed to XCom if store_socks_proxy=True) with the --proxy flag.")
|
|
||||||
logger.info("Add --verbose and --simulate flags for testing the command.")
|
|
||||||
logger.info(f"-------------------------------------------------------\n")
|
|
||||||
|
|
||||||
|
|
||||||
# Push the *original* command to XCom
|
|
||||||
context['task_instance'].xcom_push(key='ytdlp_command', value=ytdlp_cmd)
|
|
||||||
logger.info(f"Pushed original command to XCom key 'ytdlp_command'.")
|
|
||||||
|
|
||||||
# Note: Returning ytdlp_cmd below implicitly pushes the same value
|
|
||||||
# to XCom under the key 'return_value'. Downstream tasks should
|
|
||||||
# preferably use the explicitly pushed 'ytdlp_command' key for clarity.
|
|
||||||
return ytdlp_cmd # Return the original command
|
|
||||||
|
|
||||||
except AirflowException as e: # Catch AirflowExceptions raised explicitly in the code above
|
|
||||||
logger.error(f"Operation failed due to AirflowException: {e}")
|
|
||||||
raise # Re-raise AirflowExceptions to ensure task failure
|
|
||||||
except (TTransportException, PBServiceException) as e: # Catch specific Thrift/Service errors not already wrapped
|
|
||||||
logger.error(f"Unhandled Thrift/Service error: {e}", exc_info=True) # Add traceback for context
|
|
||||||
raise AirflowException(f"Unhandled YTDLP service error: {e}") # Wrap in AirflowException
|
|
||||||
except Exception as e: # General catch-all for truly unexpected errors
|
|
||||||
# Log with traceback for unexpected errors
|
|
||||||
logger.error(f"Caught unexpected error in YtdlpOpsOperator: {e}", exc_info=True)
|
|
||||||
# Ensure any unexpected error explicitly fails the task with AirflowException
|
|
||||||
raise AirflowException(f"Unexpected error caused task failure: {e}")
|
|
||||||
finally:
|
|
||||||
if transport and transport.isOpen(): # Check if transport exists and is open before closing
|
|
||||||
logger.info("Closing Thrift transport.")
|
|
||||||
transport.close()
|
|
||||||
|
|
||||||
# --- Helper Methods ---
|
|
||||||
|
|
||||||
def _get_info_json(self, token_data):
|
|
||||||
"""Safely extracts infoJson from token data."""
|
|
||||||
info_json = getattr(token_data, 'infoJson', None)
|
|
||||||
if info_json:
|
|
||||||
logger.debug("Extracted infoJson from token data.")
|
|
||||||
else:
|
|
||||||
logger.debug("No infoJson attribute found in token data.")
|
|
||||||
return info_json
|
|
||||||
|
|
||||||
def _is_valid_json(self, json_str):
|
|
||||||
"""Checks if a string is valid JSON."""
|
|
||||||
if not json_str or not isinstance(json_str, str):
|
|
||||||
logger.debug("Input is not a non-empty string, considered invalid JSON.")
|
|
||||||
return False
|
|
||||||
try:
|
|
||||||
json.loads(json_str)
|
|
||||||
logger.debug("JSON string validation successful.")
|
|
||||||
return True
|
|
||||||
except json.JSONDecodeError as e:
|
|
||||||
logger.warning(f"JSON validation failed: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _save_info_json(self, context, info_json):
|
|
||||||
"""Saves info_json to a file, handling directory creation and logging. Returns the path on success, None on failure."""
|
|
||||||
try:
|
|
||||||
# Get URL from params/context for video ID extraction
|
|
||||||
url_param = context.get('params', {}).get('url', self.url)
|
|
||||||
video_id = self._extract_video_id(url_param) # Use internal helper
|
|
||||||
|
|
||||||
# Render the info_json_dir template
|
|
||||||
save_dir_template = self.info_json_dir or "." # Default to current dir if template is None or empty string
|
|
||||||
save_dir = self.render_template(save_dir_template, context)
|
|
||||||
if not save_dir: # Handle case where template renders to empty string
|
|
||||||
logger.warning(f"Rendered info_json_dir template '{save_dir_template}' resulted in an empty path. Defaulting to '.'")
|
|
||||||
save_dir = "."
|
|
||||||
logger.info(f"Target directory for info.json (rendered): {save_dir}")
|
|
||||||
|
|
||||||
# Ensure directory exists
|
|
||||||
try:
|
|
||||||
os.makedirs(save_dir, exist_ok=True)
|
|
||||||
logger.info(f"Ensured directory exists: {save_dir}")
|
|
||||||
except OSError as e:
|
|
||||||
logger.error(f"Could not create directory {save_dir}: {e}. Cannot save info.json.")
|
|
||||||
return None # Indicate failure
|
|
||||||
|
|
||||||
# Construct filename (using potentially overridden account_id)
|
|
||||||
account_id_param = context.get('params', {}).get('account_id', self.account_id)
|
|
||||||
timestamp = int(time.time())
|
|
||||||
base_filename = f"info_{video_id}_{account_id_param}_{timestamp}.json" if video_id else f"info_{account_id_param}_{timestamp}.json"
|
|
||||||
info_json_path = os.path.join(save_dir, base_filename)
|
|
||||||
latest_json_path = os.path.join(save_dir, "latest.json") # Path for the latest symlink/copy
|
|
||||||
|
|
||||||
# Write to timestamped file
|
|
||||||
try:
|
|
||||||
logger.info(f"Writing info.json content (received from service) to {info_json_path}...")
|
|
||||||
with open(info_json_path, 'w', encoding='utf-8') as f:
|
|
||||||
f.write(info_json)
|
|
||||||
logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}")
|
|
||||||
except IOError as e:
|
|
||||||
logger.error(f"Failed to write info.json to {info_json_path}: {e}")
|
|
||||||
return None # Indicate failure
|
|
||||||
|
|
||||||
# Write to latest.json (overwrite) - best effort
|
|
||||||
try:
|
|
||||||
with open(latest_json_path, 'w', encoding='utf-8') as f:
|
|
||||||
f.write(info_json)
|
|
||||||
logger.info(f"Updated latest.json file: {latest_json_path}")
|
|
||||||
except IOError as e:
|
|
||||||
# Log warning but don't fail the whole save if only latest.json fails
|
|
||||||
logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}")
|
|
||||||
|
|
||||||
return info_json_path # Return path on success (even if latest.json failed)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True)
|
|
||||||
return None # Indicate failure
|
|
||||||
|
|
||||||
def _extract_video_id(self, url):
|
|
||||||
"""Extracts YouTube video ID from URL (internal helper)."""
|
|
||||||
if not url or not isinstance(url, str):
|
|
||||||
logger.debug("URL is empty or not a string, cannot extract video ID.")
|
|
||||||
return None
|
|
||||||
try:
|
|
||||||
# Basic extraction logic (can be enhanced for more URL types)
|
|
||||||
video_id = None
|
|
||||||
if 'youtube.com/watch?v=' in url:
|
|
||||||
video_id = url.split('v=')[1].split('&')[0]
|
|
||||||
elif 'youtu.be/' in url:
|
|
||||||
video_id = url.split('youtu.be/')[1].split('?')[0]
|
|
||||||
|
|
||||||
# Ensure it looks like a video ID (typically 11 chars, but can vary)
|
|
||||||
if video_id and len(video_id) >= 11:
|
|
||||||
video_id = video_id[:11] # Take first 11 chars as standard ID length
|
|
||||||
logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
|
|
||||||
return video_id
|
|
||||||
else:
|
|
||||||
logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
|
|
||||||
return None
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# Python Callables for Tasks
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
def display_token_info(**context):
|
|
||||||
"""Displays token info from XCom, parses info.json, and logs example commands."""
|
|
||||||
ti = context['task_instance']
|
|
||||||
logger.info("Starting display_token_info task.")
|
|
||||||
|
|
||||||
# Pull data from XCom (provide default values)
|
|
||||||
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
|
|
||||||
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
|
|
||||||
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
|
|
||||||
|
|
||||||
logger.info("\n=== Pulled Token Information from XCom ===")
|
|
||||||
logger.info(f"Info.json path: {info_json_path or 'Not found/Not saved'}")
|
|
||||||
logger.info(f"SOCKS Proxy: {socks_proxy or 'Not found/Not extracted'}")
|
|
||||||
logger.info(f"Original yt-dlp command (with tokens): {ytdlp_command or 'Not found'}")
|
|
||||||
|
|
||||||
result = {
|
|
||||||
'info_path': info_json_path,
|
|
||||||
'proxy': socks_proxy,
|
|
||||||
'ytdlp_command': ytdlp_command,
|
|
||||||
'video_info': None,
|
|
||||||
'commands': {},
|
|
||||||
'error': None
|
|
||||||
}
|
|
||||||
|
|
||||||
if info_json_path and os.path.exists(info_json_path):
|
|
||||||
logger.info(f"\n=== Processing Video Information from: {info_json_path} ===")
|
|
||||||
try:
|
|
||||||
with open(info_json_path, 'r', encoding='utf-8') as f:
|
|
||||||
info = json.load(f)
|
|
||||||
|
|
||||||
# Extract and log basic video info safely
|
|
||||||
title = info.get('title', 'Unknown Title')
|
|
||||||
uploader = info.get('uploader', 'Unknown Author')
|
|
||||||
duration = info.get('duration_string', 'Unknown Length')
|
|
||||||
upload_date_str = info.get('upload_date') # Format: YYYYMMDD
|
|
||||||
upload_date_formatted = 'Unknown Date'
|
|
||||||
if upload_date_str:
|
|
||||||
try:
|
|
||||||
# Validate format before parsing
|
|
||||||
if len(upload_date_str) == 8 and upload_date_str.isdigit():
|
|
||||||
upload_date_formatted = datetime.strptime(upload_date_str, '%Y%m%d').strftime('%Y-%m-%d')
|
|
||||||
else:
|
|
||||||
logger.warning(f"Upload date '{upload_date_str}' is not in YYYYMMDD format.")
|
|
||||||
except ValueError:
|
|
||||||
logger.warning(f"Could not parse upload_date '{upload_date_str}'")
|
|
||||||
|
|
||||||
result['video_info'] = {
|
|
||||||
'title': title,
|
|
||||||
'uploader': uploader,
|
|
||||||
'upload_date': upload_date_formatted, # Store formatted date
|
|
||||||
'duration': duration
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info(f"Title: {title}")
|
|
||||||
logger.info(f"Author: {uploader}")
|
|
||||||
logger.info(f"Date: {upload_date_formatted}")
|
|
||||||
logger.info(f"Length: {duration}")
|
|
||||||
|
|
||||||
logger.info("\n=== Example yt-dlp Commands (using saved info.json) ===")
|
|
||||||
base_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
|
|
||||||
if socks_proxy:
|
|
||||||
base_cmd += f" --proxy \"{socks_proxy}\""
|
|
||||||
|
|
||||||
# Command to list formats
|
|
||||||
format_cmd = f"{base_cmd} -F"
|
|
||||||
result['commands']['format'] = format_cmd
|
|
||||||
logger.info(f"List formats command: {format_cmd}")
|
|
||||||
|
|
||||||
# Execute and log the format listing command
|
|
||||||
logger.info("\n--- Executing Format List Command ---")
|
|
||||||
try:
|
|
||||||
# Use os.popen for simplicity, capture output
|
|
||||||
logger.info(f"Running: {format_cmd}")
|
|
||||||
format_output = os.popen(format_cmd).read()
|
|
||||||
logger.info("--- Format List Output ---")
|
|
||||||
logger.info(format_output)
|
|
||||||
logger.info("--------------------------")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error executing format command: {e}")
|
|
||||||
|
|
||||||
# Command to simulate download
|
|
||||||
simulate_cmd = f"{base_cmd} --simulate --verbose" # Add verbose for more info
|
|
||||||
result['commands']['simulate'] = simulate_cmd
|
|
||||||
logger.info(f"Simulate download command: {simulate_cmd}")
|
|
||||||
|
|
||||||
# Execute and log the simulation command
|
|
||||||
logger.info("\n--- Executing Simulation Command ---")
|
|
||||||
try:
|
|
||||||
logger.info(f"Running: {simulate_cmd}")
|
|
||||||
simulate_output = os.popen(simulate_cmd).read()
|
|
||||||
logger.info("--- Simulation Output ---")
|
|
||||||
logger.info(simulate_output)
|
|
||||||
logger.info("-------------------------")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error executing simulation command: {e}")
|
|
||||||
|
|
||||||
# Basic download command
|
|
||||||
download_cmd = base_cmd
|
|
||||||
result['commands']['download_base'] = download_cmd
|
|
||||||
logger.info(f"Base download command (add format selection, output path): {download_cmd}")
|
|
||||||
|
|
||||||
# Push generated example commands to XCom for potential downstream use
|
|
||||||
# ti.xcom_push(key='format_cmd', value=format_cmd) # Removed as requested
|
|
||||||
# ti.xcom_push(key='simulate_cmd', value=simulate_cmd) # Removed as requested
|
|
||||||
ti.xcom_push(key='download_cmd', value=download_cmd)
|
|
||||||
logger.info(f"Pushed key 'download_cmd' to XCom with value: {download_cmd}")
|
|
||||||
|
|
||||||
except json.JSONDecodeError as e:
|
|
||||||
error_msg = f"Failed to parse info.json file '{info_json_path}': {e}"
|
|
||||||
logger.error(error_msg)
|
|
||||||
result['error'] = error_msg
|
|
||||||
except FileNotFoundError:
|
|
||||||
error_msg = f"Info.json file not found at path: {info_json_path}"
|
|
||||||
logger.error(error_msg)
|
|
||||||
result['error'] = error_msg
|
|
||||||
except Exception as e:
|
|
||||||
error_msg = f"Error processing info.json file '{info_json_path}': {str(e)}"
|
|
||||||
logger.error(error_msg, exc_info=True)
|
|
||||||
result['error'] = error_msg
|
|
||||||
elif info_json_path:
|
|
||||||
error_msg = f"Info.json path provided ('{info_json_path}') but file does not exist."
|
|
||||||
logger.warning(error_msg)
|
|
||||||
result['error'] = error_msg
|
|
||||||
else:
|
|
||||||
logger.warning("No info.json path found in XCom. Cannot display video details or generate example commands.")
|
|
||||||
result['error'] = "Info.json path not available."
|
|
||||||
|
|
||||||
logger.info("Finished display_token_info task.")
|
|
||||||
# Return the collected information (useful if used as a PythonOperator return value)
|
|
||||||
return json.dumps(result) # Return as JSON string for XCom compatibility if needed
|
|
||||||
|
|
||||||
|
|
||||||
def store_token_info(**context):
|
|
||||||
"""Stores retrieved token information (command, proxy, info.json) in Redis."""
|
|
||||||
ti = context['task_instance']
|
|
||||||
# Use the redis_conn_id defined in the operator/DAG params if possible, else default
|
|
||||||
redis_conn_id = context['params'].get('redis_conn_id', 'redis_default')
|
|
||||||
redis_hook = RedisHook(redis_conn_id=redis_conn_id)
|
|
||||||
logger.info(f"Starting store_token_info task using Redis connection '{redis_conn_id}'.")
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Pull necessary data from XCom and context
|
|
||||||
url = context['params'].get('url')
|
|
||||||
if not url:
|
|
||||||
# Attempt to get URL from DAG run conf as fallback
|
|
||||||
url = context.get('dag_run', {}).conf.get('url')
|
|
||||||
if not url:
|
|
||||||
raise ValueError("URL parameter is missing in context['params'] and dag_run.conf")
|
|
||||||
logger.warning("URL parameter missing in context['params'], using URL from dag_run.conf.")
|
|
||||||
|
|
||||||
|
|
||||||
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
|
|
||||||
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy') or '' # Default to empty string if None
|
|
||||||
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
|
|
||||||
|
|
||||||
if not ytdlp_command:
|
|
||||||
logger.warning("ytdlp_command not found in XCom. Storing empty value.")
|
|
||||||
ytdlp_command = '' # Store empty if not found
|
|
||||||
|
|
||||||
# Construct the base command using info.json
|
|
||||||
ytdlp_command_base = ''
|
|
||||||
if info_json_path and os.path.exists(info_json_path):
|
|
||||||
ytdlp_command_base = f"yt-dlp --load-info-json \"{info_json_path}\""
|
|
||||||
logger.info(f"Constructed base command: {ytdlp_command_base}")
|
|
||||||
else:
|
|
||||||
logger.warning("Cannot construct base command: info_json_path not valid.")
|
|
||||||
|
|
||||||
# Construct the command with tokens and proxy
|
|
||||||
ytdlp_command_tokens = ytdlp_command # Start with original command from server
|
|
||||||
if socks_proxy:
|
|
||||||
ytdlp_command_tokens += f" --proxy \"{socks_proxy}\""
|
|
||||||
logger.info("Appended proxy to token command.")
|
|
||||||
|
|
||||||
data_to_store = {
|
|
||||||
'url': url,
|
|
||||||
'ytdlp_command': ytdlp_command_base, # Store the base command
|
|
||||||
'proxy': socks_proxy,
|
|
||||||
'info_json_path': info_json_path or '' # Store path even if None/empty
|
|
||||||
# 'info_json' will be added below
|
|
||||||
}
|
|
||||||
|
|
||||||
# Read info.json content if path exists
|
|
||||||
info_json_content = None
|
|
||||||
if info_json_path and os.path.exists(info_json_path):
|
|
||||||
try:
|
|
||||||
with open(info_json_path, 'r', encoding='utf-8') as f:
|
|
||||||
# Read and immediately validate JSON structure before storing
|
|
||||||
info_json_content = json.load(f)
|
|
||||||
# Store the validated JSON as a string
|
|
||||||
data_to_store['info_json'] = json.dumps(info_json_content)
|
|
||||||
logger.info(f"Read and validated info.json content from: {info_json_path}")
|
|
||||||
except json.JSONDecodeError as e:
|
|
||||||
logger.error(f"Failed to parse info.json file '{info_json_path}' as JSON: {e}. Storing empty content.")
|
|
||||||
data_to_store['info_json'] = '' # Store empty string on parse error
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to read info.json file '{info_json_path}': {e}. Storing empty content.")
|
|
||||||
data_to_store['info_json'] = '' # Store empty string on other read errors
|
|
||||||
else:
|
|
||||||
logger.warning(f"info_json_path ('{info_json_path}') not found or invalid. Storing without info_json content.")
|
|
||||||
data_to_store['info_json'] = '' # Store empty string if no path
|
|
||||||
|
|
||||||
# Determine Redis key using video ID
|
|
||||||
# Use the same helper method as the operator for consistency
|
|
||||||
# Need an instance or static method call. Let's make _extract_video_id static temporarily
|
|
||||||
# Or instantiate the operator just for this - less ideal.
|
|
||||||
# Simplest: Re-implement or assume utils.
|
|
||||||
# Re-implementing basic logic here for simplicity:
|
|
||||||
video_id = None
|
|
||||||
try:
|
|
||||||
if 'youtube.com/watch?v=' in url:
|
|
||||||
video_id = url.split('v=')[1].split('&')[0][:11]
|
|
||||||
elif 'youtu.be/' in url:
|
|
||||||
video_id = url.split('youtu.be/')[1].split('?')[0][:11]
|
|
||||||
except Exception:
|
|
||||||
pass # Ignore errors in ID extraction for key generation
|
|
||||||
redis_key = f"token_info:{video_id or 'unknown'}"
|
|
||||||
logger.info(f"Determined Redis key: {redis_key}")
|
|
||||||
|
|
||||||
# Store data in Redis hash
|
|
||||||
# Log presence/absence rather than full content for potentially large fields
|
|
||||||
logger.info(f"Data to store in Redis key '{redis_key}': "
|
|
||||||
f"URL='{data_to_store['url']}', "
|
|
||||||
f"Command={'<present>' if data_to_store['ytdlp_command'] else '<empty>'}, "
|
|
||||||
f"Proxy='{data_to_store['proxy'] or '<empty>'}', "
|
|
||||||
f"Path='{data_to_store['info_json_path'] or '<empty>'}', "
|
|
||||||
f"JSON Content={'<present>' if data_to_store.get('info_json') else '<empty>'}")
|
|
||||||
|
|
||||||
with redis_hook.get_conn() as redis_client:
|
|
||||||
# Extract video ID from URL
|
|
||||||
video_id = None
|
|
||||||
try:
|
|
||||||
if 'youtube.com/watch?v=' in url:
|
|
||||||
video_id = url.split('v=')[1].split('&')[0][:11]
|
|
||||||
elif 'youtu.be/' in url:
|
|
||||||
video_id = url.split('youtu.be/')[1].split('?')[0][:11]
|
|
||||||
except Exception:
|
|
||||||
pass # Ignore errors in ID extraction for key generation
|
|
||||||
|
|
||||||
# Use video ID as part of the Redis key
|
|
||||||
redis_key = f"token_info:{video_id or 'unknown'}"
|
|
||||||
logger.info(f"Determined Redis key: {redis_key}")
|
|
||||||
|
|
||||||
# Store data in Redis hash
|
|
||||||
# Add video_id, timestamp, and the constructed ytdlp_command_tokens
|
|
||||||
data_to_store['video_id'] = video_id or 'unknown'
|
|
||||||
data_to_store['timestamp'] = int(time.time())
|
|
||||||
data_to_store['ytdlp_command_tokens'] = ytdlp_command_tokens # Store the original token command
|
|
||||||
|
|
||||||
# Log fields being stored
|
|
||||||
log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()}
|
|
||||||
logger.info(f"Storing in Redis key '{redis_key}': {log_data}")
|
|
||||||
|
|
||||||
redis_client.hset(redis_key, mapping=data_to_store)
|
|
||||||
# Set expiration (e.g., 24 hours = 86400 seconds)
|
|
||||||
redis_client.expire(redis_key, 86400)
|
|
||||||
logger.info(f"Successfully stored token info in Redis key '{redis_key}' with 24h expiration.")
|
|
||||||
# Log the final stored data again for clarity
|
|
||||||
final_log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()}
|
|
||||||
logger.info(f"--- Final Data Stored in Redis Key '{redis_key}' ---")
|
|
||||||
logger.info(final_log_data)
|
|
||||||
logger.info("----------------------------------------------------")
|
|
||||||
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to store token info in Redis: {e}", exc_info=True)
|
|
||||||
# Re-raise as AirflowException to fail the task
|
|
||||||
raise AirflowException(f"Failed to store token info in Redis: {e}")
|
|
||||||
|
|
||||||
logger.info("Finished store_token_info task.")
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# DAG Definition
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
# Update default_args to match ytdlp_client_dag.py structure
|
|
||||||
default_args = {
|
|
||||||
'owner': 'airflow',
|
|
||||||
'depends_on_past': False,
|
|
||||||
'email_on_failure': False, # Match reference DAG
|
|
||||||
'email_on_retry': False, # Match reference DAG
|
|
||||||
'retries': 1, # Default task retries
|
|
||||||
'retry_delay': timedelta(minutes=5), # Standard task retry delay
|
|
||||||
'start_date': days_ago(1) # Best practice start date
|
|
||||||
}
|
|
||||||
|
|
||||||
# Update DAG definition
|
|
||||||
with DAG(
|
|
||||||
dag_id='ytdlp_client_dag_v2.1',
|
|
||||||
default_args=default_args,
|
|
||||||
schedule_interval=None, # Manually triggered DAG
|
|
||||||
catchup=False, # Don't run for past missed schedules
|
|
||||||
description='DAG for YTDLP operations using Thrift client (V2 - Refactored)', # Updated description
|
|
||||||
tags=['ytdlp', 'thrift', 'client', 'v2'], # Updated tags for better filtering
|
|
||||||
params={
|
|
||||||
# Define DAG parameters with defaults and types for UI clarity
|
|
||||||
'url': Param('https://www.youtube.com/watch?v=sOlTX9uxUtM', type=["null", "string"], description="Required: The video URL to process."), # Default URL
|
|
||||||
'redis_enabled': Param(False, type="boolean", description="Use Redis for service discovery? If False, uses service_ip/port."), # Default to direct connection
|
|
||||||
'service_ip': Param('85.192.30.55', type="string", description="Service IP if redis_enabled=False."), # Default service IP
|
|
||||||
'service_port': Param(9090, type="integer", description="Service port if redis_enabled=False."), # Default service port
|
|
||||||
'account_id': Param('account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09', type="string", description="Account ID for Redis lookup or direct call."), # Updated default account_id
|
|
||||||
'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
|
|
||||||
# Use Airflow Variable for downloads directory, matching reference DAG structure
|
|
||||||
'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json. Uses Airflow Variable 'DOWNLOADS_TEMP' or default.")
|
|
||||||
}
|
|
||||||
) as dag:
|
|
||||||
|
|
||||||
# Define Tasks
|
|
||||||
|
|
||||||
get_token = YtdlpOpsOperator(
|
|
||||||
task_id='get_token',
|
|
||||||
# Pass templated parameters from DAG run config
|
|
||||||
url="{{ params.url }}",
|
|
||||||
redis_enabled="{{ params.redis_enabled }}",
|
|
||||||
service_ip="{{ params.service_ip }}",
|
|
||||||
service_port="{{ params.service_port }}",
|
|
||||||
account_id="{{ params.account_id }}",
|
|
||||||
save_info_json=True,
|
|
||||||
info_json_dir="{{ params.info_json_dir }}",
|
|
||||||
get_socks_proxy=True,
|
|
||||||
store_socks_proxy=True,
|
|
||||||
timeout="{{ params.timeout }}",
|
|
||||||
retries=MAX_RETRIES, # Operator-specific retries if needed, else use DAG default
|
|
||||||
retry_delay=RETRY_DELAY, # Operator-specific delay if needed
|
|
||||||
# Add callbacks for logging success/failure, similar to reference DAG
|
|
||||||
on_failure_callback=lambda context: logger.error(f"Task {context['task_instance_key_str']} failed."),
|
|
||||||
on_success_callback=lambda context: logger.info(f"Task {context['task_instance_key_str']} succeeded.")
|
|
||||||
)
|
|
||||||
# Add task documentation (visible in Airflow UI)
|
|
||||||
get_token.doc_md = """
|
|
||||||
### Get Token Task
|
|
||||||
Connects to the YTDLP Thrift service (either directly or via Redis discovery)
|
|
||||||
to retrieve an authentication token and video metadata (info.json).
|
|
||||||
|
|
||||||
**Pushes to XCom:**
|
|
||||||
- `info_json_path`: Path to the saved info.json file (or None if not saved/failed).
|
|
||||||
- `socks_proxy`: The extracted SOCKS proxy string (or None if not requested/found).
|
|
||||||
- `ytdlp_command`: The original command string received from the server (contains tokens/cookies).
|
|
||||||
|
|
||||||
- Uses parameters defined in the DAG run configuration.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Optional: Add a task to explicitly check XComs for debugging (like in reference DAG)
|
|
||||||
def _check_xcom_callable(**context):
|
|
||||||
"""Logs XCom values pushed by the get_token task."""
|
|
||||||
ti = context['task_instance']
|
|
||||||
logger.info("--- Checking XCom values pushed by get_token ---")
|
|
||||||
keys_to_check = ['info_json_path', 'socks_proxy', 'ytdlp_command']
|
|
||||||
xcom_values = {}
|
|
||||||
for key in keys_to_check:
|
|
||||||
value = ti.xcom_pull(task_ids='get_token', key=key)
|
|
||||||
xcom_values[key] = value
|
|
||||||
# Avoid logging potentially sensitive command details fully in production
|
|
||||||
if key == 'ytdlp_command' and value:
|
|
||||||
log_value = f"{value[:50]}..." # Log truncated command
|
|
||||||
else:
|
|
||||||
log_value = value
|
|
||||||
logger.info(f"XCom key='{key}': {log_value}")
|
|
||||||
logger.info("----------------------------------------------")
|
|
||||||
return xcom_values # Return values for potential future use
|
|
||||||
|
|
||||||
check_xcom_task = PythonOperator(
|
|
||||||
task_id='check_xcom_after_get_token',
|
|
||||||
python_callable=_check_xcom_callable,
|
|
||||||
)
|
|
||||||
check_xcom_task.doc_md = "Logs the values pushed to XCom by the 'get_token' task for debugging purposes."
|
|
||||||
|
|
||||||
display_info = PythonOperator(
|
|
||||||
task_id='display_token_info',
|
|
||||||
python_callable=display_token_info,
|
|
||||||
trigger_rule='all_success'
|
|
||||||
)
|
|
||||||
display_info.doc_md = """
|
|
||||||
### Display Token Info Task
|
|
||||||
Pulls information from XCom, parses the `info.json` file (if available),
|
|
||||||
logs video details, and generates example `yt-dlp` commands.
|
|
||||||
|
|
||||||
**Pulls from XCom (task_id='get_token'):**
|
|
||||||
- `info_json_path`
|
|
||||||
- `socks_proxy`
|
|
||||||
- `ytdlp_command`
|
|
||||||
|
|
||||||
**Pushes to XCom:**
|
|
||||||
- `download_cmd`: Base command using `--load-info-json` (user needs to add format/output).
|
|
||||||
"""
|
|
||||||
|
|
||||||
store_info = PythonOperator(
|
|
||||||
task_id='store_token_info', # Use consistent task ID naming
|
|
||||||
python_callable=store_token_info,
|
|
||||||
)
|
|
||||||
store_info.doc_md = """
|
|
||||||
### Store Token Info Task
|
|
||||||
Pulls information from XCom and DAG parameters, reads the `info.json` content,
|
|
||||||
and stores relevant data in a Redis hash.
|
|
||||||
|
|
||||||
**Pulls from XCom (task_id='get_token'):**
|
|
||||||
- `ytdlp_command`
|
|
||||||
- `socks_proxy`
|
|
||||||
- `info_json_path`
|
|
||||||
|
|
||||||
**Pulls from DAG context:**
|
|
||||||
- `params['url']` (or `dag_run.conf['url']`)
|
|
||||||
|
|
||||||
**Stores in Redis Hash (key: `token_info:<video_id>`):**
|
|
||||||
- `url`: The video URL.
|
|
||||||
- `ytdlp_command`: Base command using `--load-info-json`.
|
|
||||||
- `proxy`: The SOCKS proxy string.
|
|
||||||
- `info_json_path`: Path to the saved info.json file.
|
|
||||||
- `info_json`: The full content of the info.json file (as a JSON string).
|
|
||||||
- `video_id`: Extracted video ID.
|
|
||||||
- `timestamp`: Unix timestamp of storage.
|
|
||||||
- `ytdlp_command_tokens`: The original command string from the server (contains tokens/cookies).
|
|
||||||
|
|
||||||
Sets a 24-hour expiration on the Redis key.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Define task dependencies matching the reference DAG structure
|
|
||||||
get_token >> check_xcom_task >> display_info >> store_info
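# Example (illustrative) run configuration for a manual trigger of this DAG.
# Keys mirror the params block above; omitted keys fall back to their defaults:
#   {"url": "https://www.youtube.com/watch?v=sOlTX9uxUtM",
#    "redis_enabled": false,
#    "service_ip": "85.192.30.55",
#    "service_port": 9090,
#    "account_id": "account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09"}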
|
|
||||||
197
dags/ytdlp_mgmt_proxy.py
Normal file
@ -0,0 +1,197 @@
|
|||||||
|
"""
|
||||||
|
DAG to manage the state of proxies used by the ytdlp-ops-server.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from airflow.models.dag import DAG
|
||||||
|
from airflow.models.param import Param
|
||||||
|
from airflow.operators.python import PythonOperator
|
||||||
|
from airflow.utils.dates import days_ago
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Import and apply Thrift exceptions patch for Airflow compatibility
|
||||||
|
try:
|
||||||
|
from thrift_exceptions_patch import patch_thrift_exceptions
|
||||||
|
patch_thrift_exceptions()
|
||||||
|
logger.info("Applied Thrift exceptions patch for Airflow compatibility.")
|
||||||
|
except ImportError:
|
||||||
|
logger.warning("Could not import thrift_exceptions_patch. Compatibility may be affected.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error applying Thrift exceptions patch: {e}")
|
||||||
|
|
||||||
|
# Thrift imports
|
||||||
|
try:
|
||||||
|
from thrift.transport import TSocket, TTransport
|
||||||
|
from thrift.protocol import TBinaryProtocol
|
||||||
|
from pangramia.yt.tokens_ops import YTTokenOpService
|
||||||
|
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
|
||||||
|
except ImportError as e:
|
||||||
|
logger.critical(f"Could not import Thrift modules: {e}. Ensure ytdlp-ops-auth package is installed.")
|
||||||
|
# Fail DAG parsing if thrift modules are not available
|
||||||
|
raise
|
||||||
|
|
||||||
|
def format_timestamp(ts_str: str) -> str:
|
||||||
|
"""Formats a string timestamp into a human-readable date string."""
|
||||||
|
if not ts_str:
|
||||||
|
return ""
|
||||||
|
try:
|
||||||
|
ts_float = float(ts_str)
|
||||||
|
if ts_float <= 0:
|
||||||
|
return ""
|
||||||
|
# 'datetime' here is the class imported above via 'from datetime import datetime'
|
||||||
|
dt_obj = datetime.fromtimestamp(ts_float)
|
||||||
|
return dt_obj.strftime('%Y-%m-%d %H:%M:%S')
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
return ts_str # Return original string if conversion fails
|
||||||
|
|
||||||
|
def get_thrift_client(host: str, port: int):
|
||||||
|
"""Helper function to create and connect a Thrift client."""
|
||||||
|
transport = TSocket.TSocket(host, port)
|
||||||
|
transport = TTransport.TFramedTransport(transport)
|
||||||
|
protocol = TBinaryProtocol.TBinaryProtocol(transport)
|
||||||
|
client = YTTokenOpService.Client(protocol)
|
||||||
|
transport.open()
|
||||||
|
logger.info(f"Connected to Thrift server at {host}:{port}")
|
||||||
|
return client, transport
|
||||||
|
|
||||||
|
def manage_proxies_callable(**context):
|
||||||
|
"""Main callable to interact with the proxy management endpoints."""
|
||||||
|
params = context["params"]
|
||||||
|
action = params["action"]
|
||||||
|
host = params["host"]
|
||||||
|
port = params["port"]
|
||||||
|
server_identity = params.get("server_identity")
|
||||||
|
proxy_url = params.get("proxy_url")
|
||||||
|
|
||||||
|
if not server_identity and action in ["ban", "unban", "reset_all"]:
|
||||||
|
raise ValueError(f"A 'server_identity' is required for the '{action}' action.")
|
||||||
|
|
||||||
|
client, transport = None, None
|
||||||
|
try:
|
||||||
|
client, transport = get_thrift_client(host, port)
|
||||||
|
|
||||||
|
if action == "list":
|
||||||
|
logger.info(f"Listing proxy statuses for server: {server_identity or 'ALL'}")
|
||||||
|
statuses = client.getProxyStatus(server_identity)
|
||||||
|
if not statuses:
|
||||||
|
logger.info("No proxy statuses found.")
|
||||||
|
print("No proxy statuses found.")
|
||||||
|
else:
|
||||||
|
from tabulate import tabulate
|
||||||
|
status_list = [
|
||||||
|
{
|
||||||
|
"Server": s.serverIdentity,
|
||||||
|
"Proxy URL": s.proxyUrl,
|
||||||
|
"Status": s.status,
|
||||||
|
"Success": s.successCount,
|
||||||
|
"Failures": s.failureCount,
|
||||||
|
"Last Success": format_timestamp(s.lastSuccessTimestamp),
|
||||||
|
"Last Failure": format_timestamp(s.lastFailureTimestamp),
|
||||||
|
}
|
||||||
|
for s in statuses
|
||||||
|
]
|
||||||
|
print("\n--- Proxy Statuses ---")
|
||||||
|
print(tabulate(status_list, headers="keys", tablefmt="grid"))
|
||||||
|
print("----------------------\n")
|
||||||
|
|
||||||
|
elif action == "ban":
|
||||||
|
if not proxy_url:
|
||||||
|
raise ValueError("A 'proxy_url' is required to ban a proxy.")
|
||||||
|
logger.info(f"Banning proxy '{proxy_url}' for server '{server_identity}'...")
|
||||||
|
success = client.banProxy(proxy_url, server_identity)
|
||||||
|
if success:
|
||||||
|
logger.info("Successfully banned proxy.")
|
||||||
|
print(f"Successfully banned proxy '{proxy_url}' for server '{server_identity}'.")
|
||||||
|
else:
|
||||||
|
logger.error("Failed to ban proxy.")
|
||||||
|
raise Exception("Server returned failure for banProxy operation.")
|
||||||
|
|
||||||
|
elif action == "unban":
|
||||||
|
if not proxy_url:
|
||||||
|
raise ValueError("A 'proxy_url' is required to unban a proxy.")
|
||||||
|
logger.info(f"Unbanning proxy '{proxy_url}' for server '{server_identity}'...")
|
||||||
|
success = client.unbanProxy(proxy_url, server_identity)
|
||||||
|
if success:
|
||||||
|
logger.info("Successfully unbanned proxy.")
|
||||||
|
print(f"Successfully unbanned proxy '{proxy_url}' for server '{server_identity}'.")
|
||||||
|
else:
|
||||||
|
logger.error("Failed to unban proxy.")
|
||||||
|
raise Exception("Server returned failure for unbanProxy operation.")
|
||||||
|
|
||||||
|
elif action == "reset_all":
|
||||||
|
logger.info(f"Resetting all proxy statuses for server '{server_identity}'...")
|
||||||
|
success = client.resetAllProxyStatuses(server_identity)
|
||||||
|
if success:
|
||||||
|
logger.info("Successfully reset all proxy statuses.")
|
||||||
|
print(f"Successfully reset all proxy statuses for server '{server_identity}'.")
|
||||||
|
else:
|
||||||
|
logger.error("Failed to reset all proxy statuses.")
|
||||||
|
raise Exception("Server returned failure for resetAllProxyStatuses operation.")
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid action: {action}")
|
||||||
|
|
||||||
|
except (PBServiceException, PBUserException) as e:
|
||||||
|
logger.error(f"Thrift error performing action '{action}': {e.message}", exc_info=True)
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error performing action '{action}': {e}", exc_info=True)
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
if transport and transport.isOpen():
|
||||||
|
transport.close()
|
||||||
|
logger.info("Thrift connection closed.")
|
||||||
|
|
||||||
|
with DAG(
|
||||||
|
dag_id="ytdlp_mgmt_proxy",
|
||||||
|
start_date=days_ago(1),
|
||||||
|
schedule=None,
|
||||||
|
catchup=False,
|
||||||
|
tags=["ytdlp", "utility", "proxy"],
|
||||||
|
doc_md="""
|
||||||
|
### YT-DLP Proxy Manager DAG
|
||||||
|
|
||||||
|
This DAG provides tools to manage the state of proxies used by the `ytdlp-ops-server`.
|
||||||
|
You can view proxy statuses and manually ban, unban, or reset proxies for a specific server instance.
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `host`: The hostname or IP of the `ytdlp-ops-server` Thrift service.
|
||||||
|
- `port`: The port of the Thrift service.
|
||||||
|
- `action`: The operation to perform.
|
||||||
|
- `list`: List proxy statuses. Provide a `server_identity` to query a specific server, or leave it blank to query the server instance you are connected to.
|
||||||
|
- `ban`: Ban a specific proxy. Requires `server_identity` and `proxy_url`.
|
||||||
|
- `unban`: Un-ban a specific proxy. Requires `server_identity` and `proxy_url`.
|
||||||
|
- `reset_all`: Reset all proxies for a server to `ACTIVE`. Requires `server_identity`.
|
||||||
|
- `server_identity`: The unique identifier for the server instance (e.g., `ytdlp-ops-airflow-service`).
|
||||||
|
- `proxy_url`: The full URL of the proxy to act upon (e.g., `socks5://host:port`).
|
||||||
|
""",
|
||||||
|
params={
|
||||||
|
"host": Param("89.253.221.173", type="string", description="The hostname of the ytdlp-ops-server service."),
|
||||||
|
"port": Param(9090, type="integer", description="The port of the ytdlp-ops-server service."),
|
||||||
|
"action": Param(
|
||||||
|
"list",
|
||||||
|
type="string",
|
||||||
|
enum=["list", "ban", "unban", "reset_all"],
|
||||||
|
description="The management action to perform.",
|
||||||
|
),
|
||||||
|
"server_identity": Param(
|
||||||
|
"ytdlp-ops-airflow-service",
|
||||||
|
type=["null", "string"],
|
||||||
|
description="The identity of the server to manage. Leave blank to query the connected server instance.",
|
||||||
|
),
|
||||||
|
"proxy_url": Param(
|
||||||
|
None,
|
||||||
|
type=["null", "string"],
|
||||||
|
description="The proxy URL to ban/unban (e.g., 'socks5://host:port').",
|
||||||
|
),
|
||||||
|
},
|
||||||
|
) as dag:
|
||||||
|
proxy_management_task = PythonOperator(
|
||||||
|
task_id="proxy_management_task",
|
||||||
|
python_callable=manage_proxies_callable,
|
||||||
|
)
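# Illustrative sketch (an assumption, not executed by the DAG): the same Thrift
# helpers can be used from a plain Python shell to spot-check proxy state.
# Host, port and server identity below are placeholders matching the DAG's
# default params; substitute your own values.
if __name__ == "__main__":
    client, transport = get_thrift_client("89.253.221.173", 9090)
    try:
        for s in client.getProxyStatus("ytdlp-ops-airflow-service"):
            print(s.serverIdentity, s.proxyUrl, s.status, s.successCount, s.failureCount)
    finally:
        if transport.isOpen():
            transport.close()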
|
||||||
@ -1,174 +0,0 @@
|
|||||||
from airflow import DAG
|
|
||||||
from airflow.models.param import Param
|
|
||||||
from airflow.operators.python import PythonOperator
|
|
||||||
from airflow.providers.redis.hooks.redis import RedisHook
|
|
||||||
from airflow.utils.dates import days_ago
|
|
||||||
from airflow.exceptions import AirflowException
|
|
||||||
from datetime import timedelta
|
|
||||||
import logging
|
|
||||||
import redis # Import redis exceptions if needed
|
|
||||||
|
|
||||||
# Import utility functions
|
|
||||||
from utils.redis_utils import _get_redis_client
|
|
||||||
|
|
||||||
# Configure logging
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Default settings
|
|
||||||
DEFAULT_QUEUE_NAME = 'video_queue' # Default base name for the queue
|
|
||||||
DEFAULT_REDIS_CONN_ID = 'redis_default'
|
|
||||||
|
|
||||||
# --- Python Callables for Tasks ---
|
|
||||||
|
|
||||||
def add_urls_callable(**context):
|
|
||||||
"""Adds URLs from comma/newline separated input to the specified Redis inbox list."""
|
|
||||||
params = context['params']
|
|
||||||
redis_conn_id = params['redis_conn_id']
|
|
||||||
queue_name = params['queue_name']
|
|
||||||
inbox_queue = f"{queue_name}_inbox"
|
|
||||||
urls_input = params['urls']
|
|
||||||
|
|
||||||
if not urls_input or not isinstance(urls_input, str):
|
|
||||||
logger.warning("No URLs provided or 'urls' parameter is not a string. Nothing to add.")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Process input: split by newline, then by comma, flatten, strip, and filter empty
|
|
||||||
urls_to_add = []
|
|
||||||
for line in urls_input.splitlines():
|
|
||||||
urls_to_add.extend(url.strip() for url in line.split(',') if url.strip())
|
|
||||||
|
|
||||||
# Remove duplicates while preserving order (optional, but good practice)
|
|
||||||
seen = set()
|
|
||||||
urls_to_add = [x for x in urls_to_add if not (x in seen or seen.add(x))]
|
|
||||||
|
|
||||||
if not urls_to_add:
|
|
||||||
logger.info("No valid URLs found after processing input. Nothing added.")
|
|
||||||
return
|
|
||||||
|
|
||||||
logger.info(f"Attempting to add {len(urls_to_add)} unique URLs to Redis list '{inbox_queue}' using connection '{redis_conn_id}'.")
|
|
||||||
try:
|
|
||||||
redis_client = _get_redis_client(redis_conn_id)
|
|
||||||
# Use rpush to add to the end of the list (FIFO behavior with lpop)
|
|
||||||
added_count = redis_client.rpush(inbox_queue, *urls_to_add)
|
|
||||||
logger.info(f"Successfully added {len(urls_to_add)} URLs to list '{inbox_queue}'. New list length: {added_count}.")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to add URLs to Redis list '{inbox_queue}': {e}", exc_info=True)
|
|
||||||
raise AirflowException(f"Failed to add URLs to Redis: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
# Removed clear_queue_callable as this DAG focuses on adding and verifying
|
|
||||||
|
|
||||||
|
|
||||||
def check_status_callable(**context):
|
|
||||||
"""Checks the type and length/size of the specified Redis inbox key."""
|
|
||||||
# Access DAG run parameters directly from context['params']
|
|
||||||
dag_params = context['params']
|
|
||||||
redis_conn_id = dag_params['redis_conn_id']
|
|
||||||
# This DAG verifies the inbox queue, so we construct the name from the base name
|
|
||||||
queue_name = dag_params['queue_name']
|
|
||||||
queue_to_check = f"{queue_name}_inbox"
|
|
||||||
|
|
||||||
if not queue_name:
|
|
||||||
raise ValueError("DAG parameter 'queue_name' (base name) cannot be empty.")
|
|
||||||
|
|
||||||
logger.info(f"Attempting to check status of Redis key '{queue_to_check}' using connection '{redis_conn_id}'.")
|
|
||||||
try:
|
|
||||||
# Use the resolved redis_conn_id to get the client
|
|
||||||
redis_client = _get_redis_client(redis_conn_id)
|
|
||||||
# redis_client.type returns bytes (e.g., b'list', b'hash', b'none')
|
|
||||||
key_type_bytes = redis_client.type(queue_to_check)
|
|
||||||
key_type_str = key_type_bytes.decode('utf-8') # Decode to string
|
|
||||||
|
|
||||||
length = 0
|
|
||||||
if key_type_str == 'list':
|
|
||||||
length = redis_client.llen(queue_to_check)
|
|
||||||
logger.info(f"Redis list '{queue_to_check}' has {length} items.")
|
|
||||||
elif key_type_str == 'hash':
|
|
||||||
length = redis_client.hlen(queue_to_check)
|
|
||||||
logger.info(f"Redis hash '{queue_to_check}' has {length} fields.")
|
|
||||||
elif key_type_str == 'none': # Check against the decoded string 'none'
|
|
||||||
logger.info(f"Redis key '{queue_to_check}' does not exist.")
|
|
||||||
else:
|
|
||||||
# Attempt to get size for other types if possible, e.g., set size
|
|
||||||
try:
|
|
||||||
if key_type_str == 'set':
|
|
||||||
length = redis_client.scard(queue_to_check)
|
|
||||||
logger.info(f"Redis set '{queue_to_check}' has {length} members.")
|
|
||||||
# Add checks for other types like zset if needed
|
|
||||||
else:
|
|
||||||
logger.info(f"Redis key '{queue_to_check}' exists but is of unhandled type '{key_type_str}'. Cannot determine size.")
|
|
||||||
except Exception as size_error:
|
|
||||||
logger.warning(f"Could not determine size for Redis key '{queue_to_check}' (type: {key_type_str}): {size_error}")
|
|
||||||
logger.info(f"Redis key '{queue_to_check}' exists but is of unhandled/unsizeable type '{key_type_str}'.")
|
|
||||||
|
|
||||||
# Push results to XCom
|
|
||||||
context['task_instance'].xcom_push(key='queue_key_type', value=key_type_str)
|
|
||||||
context['task_instance'].xcom_push(key='queue_size', value=length)
|
|
||||||
# Return status info using the resolved queue_to_check
|
|
||||||
return {'key': queue_to_check, 'type': key_type_str, 'size': length}
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
# Log error using the resolved queue_to_check
|
|
||||||
logger.error(f"Failed to check status of Redis key '{queue_to_check}': {e}", exc_info=True)
|
|
||||||
raise AirflowException(f"Failed to check Redis key status: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
# --- DAG Definition ---
|
|
||||||
default_args = {
|
|
||||||
'owner': 'airflow',
|
|
||||||
'depends_on_past': False,
|
|
||||||
'email_on_failure': False,
|
|
||||||
'email_on_retry': False,
|
|
||||||
'retries': 1,
|
|
||||||
'retry_delay': timedelta(minutes=1), # Slightly longer retry delay for management tasks
|
|
||||||
'start_date': days_ago(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
# This single DAG contains operators for different management actions,
|
|
||||||
# This DAG allows adding URLs and then checking the status of the target queue.
|
|
||||||
with DAG(
|
|
||||||
dag_id='ytdlp_mgmt_queue_add_and_verify', # Updated DAG ID
|
|
||||||
default_args=default_args,
|
|
||||||
schedule_interval=None, # Manually triggered
|
|
||||||
catchup=False,
|
|
||||||
description='Manually add URLs to a YTDLP inbox queue and verify the queue status.', # Updated description
|
|
||||||
tags=['ytdlp', 'queue', 'management', 'redis', 'manual', 'add', 'verify'], # Updated tags
|
|
||||||
params={
|
|
||||||
# Common params
|
|
||||||
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
|
|
||||||
# Params for adding URLs (and checking the same queue)
|
|
||||||
'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", title="Base Queue Name", description="Base name for the Redis queues (e.g., 'video_queue'). The DAG will add URLs to '{base_name}_inbox'."),
|
|
||||||
'urls': Param("", type="string", title="URLs to Add", description="Comma and/or newline separated list of video URLs.", multiline=True), # Updated description, keep multiline for UI
|
|
||||||
# Removed clear_queue_name param
|
|
||||||
# Removed check_queue_name param (will use queue_name)
|
|
||||||
}
|
|
||||||
) as dag:
|
|
||||||
|
|
||||||
add_urls_task = PythonOperator(
|
|
||||||
task_id='add_urls_to_queue',
|
|
||||||
python_callable=add_urls_callable,
|
|
||||||
# Pass only relevant params to the callable via context['params']
|
|
||||||
# Note: context['params'] automatically contains all DAG params
|
|
||||||
)
|
|
||||||
add_urls_task.doc_md = """
|
|
||||||
### Add URLs to Queue
|
|
||||||
Adds URLs from the `urls` parameter (comma/newline separated) to the Redis list specified by `queue_name`.
|
|
||||||
*Trigger this task manually via the UI and provide the URLs.*
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Removed clear_queue_task
|
|
||||||
|
|
||||||
check_status_task = PythonOperator(
|
|
||||||
task_id='check_queue_status_after_add',
|
|
||||||
python_callable=check_status_callable,
|
|
||||||
# No task-specific params needed; callable uses context['params'] directly.
|
|
||||||
)
|
|
||||||
check_status_task.doc_md = """
|
|
||||||
### Check Queue Status After Add
|
|
||||||
Checks the type and length/size of the Redis key specified by `queue_name` (the same queue URLs were added to).
|
|
||||||
Logs the result and pushes `queue_key_type` and `queue_size` to XCom.
|
|
||||||
*This task runs automatically after `add_urls_to_queue`.*
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Define dependency: Add URLs first, then check status
|
|
||||||
add_urls_task >> check_status_task
|
|
||||||
@ -1,179 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# vim:fenc=utf-8
|
|
||||||
#
|
|
||||||
# Copyright © 2024 rl <rl@rlmbp>
|
|
||||||
#
|
|
||||||
# Distributed under terms of the MIT license.
|
|
||||||
|
|
||||||
"""
|
|
||||||
Airflow DAG for manually checking the status (type and size) of a specific Redis key used by YTDLP queues.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from airflow import DAG
|
|
||||||
from airflow.exceptions import AirflowException
|
|
||||||
from airflow.models.param import Param
|
|
||||||
from airflow.operators.python import PythonOperator
|
|
||||||
from airflow.providers.redis.hooks.redis import RedisHook
|
|
||||||
from airflow.utils.dates import days_ago
|
|
||||||
from datetime import datetime, timedelta, timezone
|
|
||||||
import logging
|
|
||||||
import json
|
|
||||||
import redis # Import redis exceptions if needed
|
|
||||||
|
|
||||||
# Configure logging
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Default settings
|
|
||||||
DEFAULT_REDIS_CONN_ID = 'redis_default'
|
|
||||||
DEFAULT_QUEUE_BASE_NAME = 'video_queue'
|
|
||||||
DEFAULT_MAX_ITEMS_TO_LIST = 25
|
|
||||||
|
|
||||||
# Import utility functions
|
|
||||||
from utils.redis_utils import _get_redis_client
|
|
||||||
|
|
||||||
# --- Python Callable for Check and List Task ---
|
|
||||||
|
|
||||||
def check_and_list_queue_callable(**context):
|
|
||||||
"""Checks the type and size of a Redis key and lists its recent contents."""
|
|
||||||
params = context['params']
|
|
||||||
redis_conn_id = params['redis_conn_id']
|
|
||||||
# queue_suffix is passed from the PythonOperator's op_kwargs, which are available in the context
|
|
||||||
queue_suffix = context['queue_suffix']
|
|
||||||
queue_name = params.get('queue_name', DEFAULT_QUEUE_BASE_NAME)
|
|
||||||
queue_to_check = f"{queue_name}{queue_suffix}"
|
|
||||||
max_items = int(params.get('max_items_to_list', DEFAULT_MAX_ITEMS_TO_LIST))
|
|
||||||
|
|
||||||
logger.info(f"--- Checking Status and Contents of Redis Key: '{queue_to_check}' ---")
|
|
||||||
logger.info(f"Using connection '{redis_conn_id}', listing up to {max_items} items.")
|
|
||||||
|
|
||||||
try:
|
|
||||||
redis_client = _get_redis_client(redis_conn_id)
|
|
||||||
key_type_bytes = redis_client.type(queue_to_check)
|
|
||||||
key_type = key_type_bytes.decode('utf-8')
|
|
||||||
|
|
||||||
if key_type == 'list':
|
|
||||||
list_length = redis_client.llen(queue_to_check)
|
|
||||||
logger.info(f"Redis key '{queue_to_check}' is a LIST with {list_length} items.")
|
|
||||||
if list_length > 0:
|
|
||||||
items_to_fetch = min(max_items, list_length)
|
|
||||||
# lrange with negative indices gets items from the end (most recent for rpush)
|
|
||||||
contents_bytes = redis_client.lrange(queue_to_check, -items_to_fetch, -1)
|
|
||||||
contents = [item.decode('utf-8') for item in contents_bytes]
|
|
||||||
contents.reverse() # Show most recent first
|
|
||||||
logger.info(f"--- Showing most recent {len(contents)} of {list_length} items ---")
|
|
||||||
for i, item in enumerate(contents):
|
|
||||||
logger.info(f" [recent_{i}]: {item}")
|
|
||||||
if list_length > len(contents):
|
|
||||||
logger.info(f" ... ({list_length - len(contents)} older items not shown)")
|
|
||||||
logger.info(f"--- End of List Contents ---")
|
|
||||||
|
|
||||||
elif key_type == 'hash':
|
|
||||||
hash_size = redis_client.hlen(queue_to_check)
|
|
||||||
logger.info(f"Redis key '{queue_to_check}' is a HASH with {hash_size} fields.")
|
|
||||||
if hash_size > 0:
|
|
||||||
logger.info(f"--- Showing a sample of up to {max_items} fields ---")
|
|
||||||
item_count = 0
|
|
||||||
# Using hscan_iter to safely iterate over hash fields, count is a hint
|
|
||||||
for field_bytes, value_bytes in redis_client.hscan_iter(queue_to_check, count=max_items):
|
|
||||||
if item_count >= max_items:
|
|
||||||
logger.info(f" ... (stopped listing after {max_items} items of {hash_size})")
|
|
||||||
break
|
|
||||||
field = field_bytes.decode('utf-8')
|
|
||||||
value = value_bytes.decode('utf-8')
|
|
||||||
# Try to pretty-print if value is JSON
|
|
||||||
try:
|
|
||||||
parsed_value = json.loads(value)
|
|
||||||
# Check for timestamp to show age
|
|
||||||
timestamp = parsed_value.get('end_time') or parsed_value.get('start_time')
|
|
||||||
age_str = ""
|
|
||||||
if timestamp:
|
|
||||||
age_seconds = (datetime.now(timezone.utc) - datetime.fromtimestamp(timestamp, timezone.utc)).total_seconds()
|
|
||||||
age_str = f" (age: {timedelta(seconds=age_seconds)})"
|
|
||||||
|
|
||||||
pretty_value = json.dumps(parsed_value, indent=2)
|
|
||||||
logger.info(f" Field '{field}'{age_str}:\n{pretty_value}")
|
|
||||||
except (json.JSONDecodeError, TypeError):
|
|
||||||
logger.info(f" Field '{field}': {value}")
|
|
||||||
item_count += 1
|
|
||||||
logger.info(f"--- End of Hash Contents ---")
|
|
||||||
|
|
||||||
elif key_type == 'none':
|
|
||||||
logger.info(f"Redis key '{queue_to_check}' does not exist.")
|
|
||||||
else:
|
|
||||||
logger.info(f"Redis key '{queue_to_check}' is of type '{key_type}'. Listing contents for this type is not implemented.")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to check/list contents of Redis key '{queue_to_check}': {e}", exc_info=True)
|
|
||||||
raise AirflowException(f"Failed to process Redis key: {e}")
|
|
||||||
|
|
||||||
# --- DAG Definition ---
|
|
||||||
default_args = {
|
|
||||||
'owner': 'airflow',
|
|
||||||
'depends_on_past': False,
|
|
||||||
'email_on_failure': False,
|
|
||||||
'email_on_retry': False,
|
|
||||||
'retries': 0, # No retries for a manual check/list operation
|
|
||||||
'start_date': days_ago(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
with DAG(
|
|
||||||
dag_id='ytdlp_mgmt_queues_check_status',
|
|
||||||
default_args=default_args,
|
|
||||||
schedule_interval=None, # Manually triggered
|
|
||||||
catchup=False,
|
|
||||||
description='Manually check the status and recent items of all YTDLP Redis queues for a given base name.',
|
|
||||||
tags=['ytdlp', 'queue', 'management', 'redis', 'manual', 'status', 'list'],
|
|
||||||
params={
|
|
||||||
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
|
|
||||||
'queue_name': Param(
|
|
||||||
DEFAULT_QUEUE_BASE_NAME,
|
|
||||||
type="string",
|
|
||||||
description="Base name for the Redis queues (e.g., 'video_queue')."
|
|
||||||
),
|
|
||||||
'max_items_to_list': Param(DEFAULT_MAX_ITEMS_TO_LIST, type="integer", description="Maximum number of recent items/fields to list from each queue."),
|
|
||||||
}
|
|
||||||
) as dag:
|
|
||||||
|
|
||||||
check_inbox_queue = PythonOperator(
|
|
||||||
task_id='check_inbox_queue',
|
|
||||||
python_callable=check_and_list_queue_callable,
|
|
||||||
op_kwargs={'queue_suffix': '_inbox'},
|
|
||||||
)
|
|
||||||
check_inbox_queue.doc_md = """
|
|
||||||
### Check Inbox Queue (`_inbox`)
|
|
||||||
Checks the status and lists the most recent URLs waiting to be processed.
|
|
||||||
The full queue name is `{{ params.queue_name }}_inbox`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
check_progress_queue = PythonOperator(
|
|
||||||
task_id='check_progress_queue',
|
|
||||||
python_callable=check_and_list_queue_callable,
|
|
||||||
op_kwargs={'queue_suffix': '_progress'},
|
|
||||||
)
|
|
||||||
check_progress_queue.doc_md = """
|
|
||||||
### Check Progress Queue (`_progress`)
|
|
||||||
Checks the status and lists a sample of URLs currently being processed.
|
|
||||||
The full queue name is `{{ params.queue_name }}_progress`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
check_result_queue = PythonOperator(
|
|
||||||
task_id='check_result_queue',
|
|
||||||
python_callable=check_and_list_queue_callable,
|
|
||||||
op_kwargs={'queue_suffix': '_result'},
|
|
||||||
)
|
|
||||||
check_result_queue.doc_md = """
|
|
||||||
### Check Result Queue (`_result`)
|
|
||||||
Checks the status and lists a sample of successfully processed URLs.
|
|
||||||
The full queue name is `{{ params.queue_name }}_result`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
check_fail_queue = PythonOperator(
|
|
||||||
task_id='check_fail_queue',
|
|
||||||
python_callable=check_and_list_queue_callable,
|
|
||||||
op_kwargs={'queue_suffix': '_fail'},
|
|
||||||
)
|
|
||||||
check_fail_queue.doc_md = """
|
|
||||||
### Check Fail Queue (`_fail`)
|
|
||||||
Checks the status and lists a sample of failed URLs.
|
|
||||||
The full queue name is `{{ params.queue_name }}_fail`.
|
|
||||||
"""
|
|
||||||
@ -1,99 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# vim:fenc=utf-8
|
|
||||||
#
|
|
||||||
# Copyright © 2024 rl <rl@rlmbp>
|
|
||||||
#
|
|
||||||
# Distributed under terms of the MIT license.
|
|
||||||
|
|
||||||
"""
|
|
||||||
Airflow DAG for manually clearing (deleting) a specific Redis key used by YTDLP queues.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from airflow import DAG
|
|
||||||
from airflow.exceptions import AirflowException
|
|
||||||
from airflow.models.param import Param
|
|
||||||
from airflow.operators.python import PythonOperator
|
|
||||||
from airflow.providers.redis.hooks.redis import RedisHook
|
|
||||||
from airflow.utils.dates import days_ago
|
|
||||||
from datetime import timedelta
|
|
||||||
import logging
|
|
||||||
import redis # Import redis exceptions if needed
|
|
||||||
|
|
||||||
# Configure logging
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Default settings
|
|
||||||
DEFAULT_REDIS_CONN_ID = 'redis_default'
|
|
||||||
# Provide a placeholder default, user MUST specify the queue to clear
|
|
||||||
DEFAULT_QUEUE_TO_CLEAR = 'PLEASE_SPECIFY_QUEUE_TO_CLEAR'
|
|
||||||
|
|
||||||
# Import utility functions
|
|
||||||
from utils.redis_utils import _get_redis_client
|
|
||||||
|
|
||||||
# --- Python Callable for Clear Task ---
|
|
||||||
|
|
||||||
def clear_queue_callable(**context):
|
|
||||||
"""Clears (deletes) the specified Redis key (queue/hash)."""
|
|
||||||
params = context['params']
|
|
||||||
redis_conn_id = params['redis_conn_id']
|
|
||||||
queue_to_clear = params['queue_to_clear'] # Specific queue/hash name
|
|
||||||
|
|
||||||
if not queue_to_clear or queue_to_clear == DEFAULT_QUEUE_TO_CLEAR:
|
|
||||||
raise ValueError("Parameter 'queue_to_clear' must be specified and cannot be the default placeholder.")
|
|
||||||
|
|
||||||
logger.info(f"Attempting to clear Redis key '{queue_to_clear}' using connection '{redis_conn_id}'.")
|
|
||||||
try:
|
|
||||||
redis_client = _get_redis_client(redis_conn_id)
|
|
||||||
deleted_count = redis_client.delete(queue_to_clear)
|
|
||||||
if deleted_count > 0:
|
|
||||||
logger.info(f"Successfully cleared Redis key '{queue_to_clear}'.")
|
|
||||||
else:
|
|
||||||
logger.info(f"Redis key '{queue_to_clear}' did not exist or was already empty.")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to clear Redis key '{queue_to_clear}': {e}", exc_info=True)
|
|
||||||
raise AirflowException(f"Failed to clear Redis key: {e}")
|
|
||||||
|
|
||||||
# --- DAG Definition ---
|
|
||||||
default_args = {
|
|
||||||
'owner': 'airflow',
|
|
||||||
'depends_on_past': False,
|
|
||||||
'email_on_failure': False,
|
|
||||||
'email_on_retry': False,
|
|
||||||
'retries': 0, # No retries for manual clear operation
|
|
||||||
'start_date': days_ago(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
with DAG(
|
|
||||||
dag_id='ytdlp_mgmt_queue_clear',
|
|
||||||
default_args=default_args,
|
|
||||||
schedule_interval=None, # Manually triggered
|
|
||||||
catchup=False,
|
|
||||||
description='Manually clear/delete a specific YTDLP Redis queue/key (inbox, progress, result, fail). Use with caution!',
|
|
||||||
tags=['ytdlp', 'queue', 'management', 'redis', 'manual', 'clear'],
|
|
||||||
params={
|
|
||||||
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
|
|
||||||
'queue_to_clear': Param(
|
|
||||||
DEFAULT_QUEUE_TO_CLEAR,
|
|
||||||
type="string",
|
|
||||||
description="Exact name of the Redis key to clear (e.g., 'video_queue_inbox_account_xyz', 'video_queue_progress', 'video_queue_result', 'video_queue_fail')."
|
|
||||||
),
|
|
||||||
}
|
|
||||||
) as dag:
|
|
||||||
|
|
||||||
clear_queue_task = PythonOperator(
|
|
||||||
task_id='clear_specified_queue',
|
|
||||||
python_callable=clear_queue_callable,
|
|
||||||
# Params are implicitly passed via context['params']
|
|
||||||
)
|
|
||||||
clear_queue_task.doc_md = """
|
|
||||||
### Clear Specified Queue/Key Task
|
|
||||||
Deletes the Redis key specified by the `queue_to_clear` parameter.
|
|
||||||
This can target any key, including:
|
|
||||||
- `_inbox` (Redis List): Contains URLs waiting to be processed.
|
|
||||||
- `_progress` (Redis Hash): Contains URLs currently being processed.
|
|
||||||
- `_result` (Redis Hash): Contains details of successfully processed URLs.
|
|
||||||
- `_fail` (Redis Hash): Contains details of failed URLs.
|
|
||||||
|
|
||||||
**Warning:** This operation is destructive and cannot be undone. Ensure you specify the correct key name.
|
|
||||||
*Trigger this task manually via the UI.*
|
|
||||||
"""
|
|
||||||
@ -1,151 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# vim:fenc=utf-8
|
|
||||||
#
|
|
||||||
# Copyright © 2024 rl <rl@rlmbp>
|
|
||||||
#
|
|
||||||
# Distributed under terms of the MIT license.
|
|
||||||
|
|
||||||
"""
|
|
||||||
Airflow DAG for manually listing the contents of a specific Redis key used by YTDLP queues.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from airflow import DAG
|
|
||||||
from airflow.exceptions import AirflowException
|
|
||||||
from airflow.models.param import Param
|
|
||||||
from airflow.operators.python import PythonOperator
|
|
||||||
from airflow.providers.redis.hooks.redis import RedisHook
|
|
||||||
from airflow.utils.dates import days_ago
|
|
||||||
from datetime import timedelta
|
|
||||||
import logging
|
|
||||||
import json
|
|
||||||
import redis # Import redis exceptions if needed
|
|
||||||
|
|
||||||
# Configure logging
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Default settings
|
|
||||||
DEFAULT_REDIS_CONN_ID = 'redis_default'
|
|
||||||
# Default to a common inbox pattern, user should override with the specific key
|
|
||||||
DEFAULT_QUEUE_TO_LIST = 'video_queue_inbox'
|
|
||||||
DEFAULT_MAX_ITEMS = 10 # Limit number of items listed by default
|
|
||||||
|
|
||||||
# Import utility functions
|
|
||||||
from utils.redis_utils import _get_redis_client
|
|
||||||
|
|
||||||
# --- Python Callable for List Contents Task ---
|
|
||||||
|
|
||||||
def list_contents_callable(**context):
|
|
||||||
"""Lists the contents of the specified Redis key (list or hash)."""
|
|
||||||
params = context['params']
|
|
||||||
redis_conn_id = params['redis_conn_id']
|
|
||||||
queue_to_list = params['queue_to_list']
|
|
||||||
max_items = params.get('max_items', DEFAULT_MAX_ITEMS)
|
|
||||||
|
|
||||||
if not queue_to_list:
|
|
||||||
raise ValueError("Parameter 'queue_to_list' cannot be empty.")
|
|
||||||
|
|
||||||
logger.info(f"Attempting to list contents of Redis key '{queue_to_list}' (max: {max_items}) using connection '{redis_conn_id}'.")
|
|
||||||
try:
|
|
||||||
redis_client = _get_redis_client(redis_conn_id)
|
|
||||||
key_type_bytes = redis_client.type(queue_to_list)
|
|
||||||
key_type = key_type_bytes.decode('utf-8') # Decode type
|
|
||||||
|
|
||||||
if key_type == 'list':
|
|
||||||
list_length = redis_client.llen(queue_to_list)
|
|
||||||
# Get the last N items, which are the most recently added with rpush
|
|
||||||
items_to_fetch = min(max_items, list_length)
|
|
||||||
# lrange with negative indices gets items from the end of the list.
|
|
||||||
# -N to -1 gets the last N items.
|
|
||||||
contents_bytes = redis_client.lrange(queue_to_list, -items_to_fetch, -1)
|
|
||||||
contents = [item.decode('utf-8') for item in contents_bytes]
|
|
||||||
# Reverse the list so the absolute most recent item is printed first
|
|
||||||
contents.reverse()
|
|
||||||
logger.info(f"--- Contents of Redis List '{queue_to_list}' (showing most recent {len(contents)} of {list_length}) ---")
|
|
||||||
for i, item in enumerate(contents):
|
|
||||||
# The index here is just for display, 0 is the most recent
|
|
||||||
logger.info(f" [recent_{i}]: {item}")
|
|
||||||
if list_length > len(contents):
|
|
||||||
logger.info(f" ... ({list_length - len(contents)} older items not shown)")
|
|
||||||
logger.info(f"--- End of List Contents ---")
|
|
||||||
# Optionally push contents to XCom if small enough
|
|
||||||
# context['task_instance'].xcom_push(key='list_contents', value=contents)
|
|
||||||
|
|
||||||
elif key_type == 'hash':
|
|
||||||
hash_size = redis_client.hlen(queue_to_list)
|
|
||||||
# HGETALL can be risky for large hashes. Consider HSCAN for production.
|
|
||||||
# For manual inspection, HGETALL is often acceptable.
|
|
||||||
if hash_size > max_items * 2: # Heuristic: avoid huge HGETALL
|
|
||||||
logger.warning(f"Hash '{queue_to_list}' has {hash_size} fields, which is large. Listing might be slow or incomplete. Consider using redis-cli HSCAN.")
|
|
||||||
# Optionally implement HSCAN here for large hashes
|
|
||||||
# hgetall returns dict of bytes keys and bytes values, decode them
|
|
||||||
contents_bytes = redis_client.hgetall(queue_to_list)
|
|
||||||
contents = {k.decode('utf-8'): v.decode('utf-8') for k, v in contents_bytes.items()}
|
|
||||||
logger.info(f"--- Contents of Redis Hash '{queue_to_list}' ({len(contents)} fields) ---")
|
|
||||||
item_count = 0
|
|
||||||
for key, value in contents.items(): # key and value are now strings
|
|
||||||
if item_count >= max_items:
|
|
||||||
logger.info(f" ... (stopped listing after {max_items} items of {hash_size})")
|
|
||||||
break
|
|
||||||
# Attempt to pretty-print if value is JSON
|
|
||||||
try:
|
|
||||||
parsed_value = json.loads(value)
|
|
||||||
pretty_value = json.dumps(parsed_value, indent=2)
|
|
||||||
logger.info(f" '{key}':\n{pretty_value}")
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
logger.info(f" '{key}': {value}") # Print as string if not JSON
|
|
||||||
item_count += 1
|
|
||||||
logger.info(f"--- End of Hash Contents ---")
|
|
||||||
# Optionally push contents to XCom if small enough
|
|
||||||
# context['task_instance'].xcom_push(key='hash_contents', value=contents)
|
|
||||||
|
|
||||||
elif key_type == 'none':
|
|
||||||
logger.info(f"Redis key '{queue_to_list}' does not exist.")
|
|
||||||
else:
|
|
||||||
logger.info(f"Redis key '{queue_to_list}' is of type '{key_type}'. Listing contents for this type is not implemented.")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to list contents of Redis key '{queue_to_list}': {e}", exc_info=True)
|
|
||||||
raise AirflowException(f"Failed to list Redis key contents: {e}")
|
|
||||||
|
|
||||||
# --- DAG Definition ---
|
|
||||||
default_args = {
|
|
||||||
'owner': 'airflow',
|
|
||||||
'depends_on_past': False,
|
|
||||||
'email_on_failure': False,
|
|
||||||
'email_on_retry': False,
|
|
||||||
'retries': 0, # No retries for manual list operation
|
|
||||||
'start_date': days_ago(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
with DAG(
|
|
||||||
dag_id='ytdlp_mgmt_queue_list_contents',
|
|
||||||
default_args=default_args,
|
|
||||||
schedule_interval=None, # Manually triggered
|
|
||||||
catchup=False,
|
|
||||||
description='Manually list the contents of a specific YTDLP Redis queue/key (list or hash).',
|
|
||||||
tags=['ytdlp', 'queue', 'management', 'redis', 'manual', 'list'],
|
|
||||||
params={
|
|
||||||
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
|
|
||||||
'queue_to_list': Param(
|
|
||||||
DEFAULT_QUEUE_TO_LIST,
|
|
||||||
type="string",
|
|
||||||
description="Exact name of the Redis key (list/hash) to list contents for (e.g., 'video_queue_inbox_account_xyz', 'video_queue_progress', etc.)."
|
|
||||||
),
|
|
||||||
'max_items': Param(DEFAULT_MAX_ITEMS, type="integer", description="Maximum number of items/fields to list. For lists, shows the most recent items."),
|
|
||||||
}
|
|
||||||
) as dag:
|
|
||||||
|
|
||||||
list_contents_task = PythonOperator(
|
|
||||||
task_id='list_specified_queue_contents',
|
|
||||||
python_callable=list_contents_callable,
|
|
||||||
# Params are implicitly passed via context['params']
|
|
||||||
)
|
|
||||||
list_contents_task.doc_md = """
|
|
||||||
### List Specified Queue/Key Contents Task
|
|
||||||
Lists the contents of the Redis key specified by `queue_to_list`.
|
|
||||||
- For **Lists** (e.g., `_inbox`), shows the first `max_items`.
|
|
||||||
- For **Hashes** (e.g., `_progress`, `_result`, `_fail`), shows up to `max_items` key-value pairs. Attempts to pretty-print JSON values.
|
|
||||||
- Logs a warning for very large hashes.
|
|
||||||
|
|
||||||
*Trigger this task manually via the UI.*
|
|
||||||
"""
|
|
||||||
493
dags/ytdlp_mgmt_queues.py
Normal file
@ -0,0 +1,493 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Airflow DAG for manually adding YouTube URLs or Video IDs to a Redis queue.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import List, Optional
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from airflow.exceptions import AirflowException
|
||||||
|
from airflow.models.dag import DAG
|
||||||
|
from airflow.models.param import Param
|
||||||
|
from airflow.operators.python import PythonOperator, BranchPythonOperator
|
||||||
|
from airflow.operators.empty import EmptyOperator
|
||||||
|
from airflow.providers.redis.hooks.redis import RedisHook
|
||||||
|
from airflow.utils.dates import days_ago
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Default settings
|
||||||
|
DEFAULT_REDIS_CONN_ID = "redis_default"
|
||||||
|
DEFAULT_QUEUE_NAME = "video_queue"
|
||||||
|
DEFAULT_QUEUE_TO_CLEAR = 'PLEASE_SPECIFY_QUEUE_TO_CLEAR'
|
||||||
|
|
||||||
|
|
||||||
|
# --- Helper Functions ---
|
||||||
|
|
||||||
|
def _get_redis_client(redis_conn_id: str):
|
||||||
|
"""Gets a Redis client from an Airflow connection."""
|
||||||
|
try:
|
||||||
|
redis_hook = RedisHook(redis_conn_id=redis_conn_id)
|
||||||
|
return redis_hook.get_conn()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to connect to Redis using connection '{redis_conn_id}': {e}")
|
||||||
|
raise AirflowException(f"Redis connection failed: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_video_inputs(input_str: str) -> List[str]:
|
||||||
|
"""Parses a flexible string of video inputs into a list of individual items."""
|
||||||
|
if not input_str or not isinstance(input_str, str):
|
||||||
|
return []
|
||||||
|
|
||||||
|
input_str = input_str.strip()
|
||||||
|
|
||||||
|
# 1. Try to parse as a JSON array
|
||||||
|
if input_str.startswith("[") and input_str.endswith("]"):
|
||||||
|
try:
|
||||||
|
items = json.loads(input_str)
|
||||||
|
if isinstance(items, list):
|
||||||
|
logger.info("Successfully parsed input as a JSON array.")
|
||||||
|
return [str(item).strip() for item in items]
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
logger.warning("Input looked like a JSON array but failed to parse. Treating as a comma-separated string.")
|
||||||
|
|
||||||
|
# 2. Treat as a comma-separated string
|
||||||
|
items = [item.strip() for item in input_str.split(",")]
|
||||||
|
|
||||||
|
# 3. Clean up quotes and extra whitespace from each item
|
||||||
|
cleaned_items = []
|
||||||
|
for item in items:
|
||||||
|
if item.startswith(('"', "'")) and item.endswith(('"', "'")):
|
||||||
|
item = item[1:-1]
|
||||||
|
if item: # Only add non-empty items
|
||||||
|
cleaned_items.append(item.strip())
|
||||||
|
|
||||||
|
return cleaned_items
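# Example (illustrative, using a placeholder ID): both of the following inputs
# parse to the same list, ['dQw4w9WgXcQ', 'https://youtu.be/abc123def45']:
#   parse_video_inputs('dQw4w9WgXcQ, "https://youtu.be/abc123def45"')
#   parse_video_inputs('["dQw4w9WgXcQ", "https://youtu.be/abc123def45"]')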
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_to_url(item: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Validates if an item is a recognizable YouTube URL or video ID,
|
||||||
|
and normalizes it to a standard watch URL format.
|
||||||
|
"""
|
||||||
|
if not item:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Regex for a standard 11-character YouTube video ID
|
||||||
|
video_id_pattern = r"^[a-zA-Z0-9_-]{11}$"
|
||||||
|
|
||||||
|
# Check if the item itself is a video ID
|
||||||
|
if re.match(video_id_pattern, item):
|
||||||
|
video_id = item
|
||||||
|
return f"https://www.youtube.com/watch?v={video_id}"
|
||||||
|
|
||||||
|
# Comprehensive regex to extract video ID from various URL formats
|
||||||
|
# Covers: watch, youtu.be, shorts, embed, /v/
|
||||||
|
url_patterns = [
|
||||||
|
r"(?:v=|\/v\/|youtu\.be\/|embed\/|shorts\/)([a-zA-Z0-9_-]{11})"
|
||||||
|
]
|
||||||
|
for pattern in url_patterns:
|
||||||
|
match = re.search(pattern, item)
|
||||||
|
if match:
|
||||||
|
video_id = match.group(1)
|
||||||
|
return f"https://www.youtube.com/watch?v={video_id}"
|
||||||
|
|
||||||
|
logger.warning(f"Could not recognize '{item}' as a valid YouTube URL or video ID.")
|
||||||
|
return None
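# Example (illustrative, with a placeholder ID): each of these inputs normalizes
# to 'https://www.youtube.com/watch?v=abc123def45':
#   normalize_to_url('abc123def45')
#   normalize_to_url('https://youtu.be/abc123def45')
#   normalize_to_url('https://www.youtube.com/shorts/abc123def45')
# Anything that is not a recognizable URL or 11-character video ID returns None.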
|
||||||
|
|
||||||
|
|
||||||
|
def dump_redis_data_to_csv(redis_client, dump_dir, patterns):
|
||||||
|
"""Dumps data from Redis keys matching patterns to separate CSV files in a timestamped directory."""
|
||||||
|
timestamp_dir = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||||||
|
full_dump_path = os.path.join(dump_dir, timestamp_dir)
|
||||||
|
|
||||||
|
os.makedirs(full_dump_path, exist_ok=True)
|
||||||
|
logger.info(f"Created dump directory: {full_dump_path}")
|
||||||
|
|
||||||
|
for pattern in patterns:
|
||||||
|
if not pattern: continue
|
||||||
|
|
||||||
|
# Sanitize pattern for filename
|
||||||
|
sanitized_pattern = re.sub(r'[^a-zA-Z0-9_-]', '_', pattern)
|
||||||
|
timestamp_file = datetime.now().strftime('%Y%m%d')
|
||||||
|
dump_file_name = f'redis_dump_{sanitized_pattern}_{timestamp_file}.csv'
|
||||||
|
dump_file_path = os.path.join(full_dump_path, dump_file_name)
|
||||||
|
|
||||||
|
logger.info(f"Dumping keys matching '{pattern}' to {dump_file_path}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(dump_file_path, 'w', newline='', encoding='utf-8') as csvfile:
|
||||||
|
writer = csv.writer(csvfile)
|
||||||
|
writer.writerow(['key', 'type', 'field_or_index', 'value'])
|
||||||
|
|
||||||
|
keys_found = 0
|
||||||
|
for key_bytes in redis_client.scan_iter(pattern):
|
||||||
|
key = key_bytes.decode('utf-8')
|
||||||
|
keys_found += 1
|
||||||
|
key_type = redis_client.type(key).decode('utf-8')
|
||||||
|
|
||||||
|
if key_type == 'hash':
|
||||||
|
for field, value in redis_client.hgetall(key).items():
|
||||||
|
writer.writerow([key, key_type, field.decode('utf-8'), value.decode('utf-8')])
|
||||||
|
elif key_type == 'list':
|
||||||
|
for index, value in enumerate(redis_client.lrange(key, 0, -1)):
|
||||||
|
writer.writerow([key, key_type, index, value.decode('utf-8')])
|
||||||
|
elif key_type == 'set':
|
||||||
|
for member in redis_client.smembers(key):
|
||||||
|
writer.writerow([key, key_type, None, member.decode('utf-8')])
|
||||||
|
elif key_type == 'string':
|
||||||
|
value = redis_client.get(key)
|
||||||
|
if value:
|
||||||
|
writer.writerow([key, key_type, None, value.decode('utf-8')])
|
||||||
|
|
||||||
|
if keys_found > 0:
|
||||||
|
logger.info(f"Successfully dumped {keys_found} keys for pattern '{pattern}' to {dump_file_path}")
|
||||||
|
else:
|
||||||
|
logger.info(f"No keys found for pattern '{pattern}'. Empty CSV file created at {dump_file_path}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to dump Redis data for pattern '{pattern}': {e}", exc_info=True)
|
||||||
|
raise AirflowException(f"Failed to dump Redis data for pattern '{pattern}': {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def clear_queue_callable(**context):
|
||||||
|
"""Dumps Redis data to CSV and/or clears a specified Redis key."""
|
||||||
|
params = context['params']
|
||||||
|
redis_conn_id = params['redis_conn_id']
|
||||||
|
queue_to_clear = params['queue_to_clear']
|
||||||
|
dump_queues = params['dump_queues']
|
||||||
|
# Get the rendered dump_dir from the templates_dict passed to the operator
|
||||||
|
dump_dir = context['templates_dict']['dump_dir']
|
||||||
|
dump_patterns = params['dump_patterns'].split(',') if params.get('dump_patterns') else []
|
||||||
|
|
||||||
|
redis_client = _get_redis_client(redis_conn_id)
|
||||||
|
|
||||||
|
if dump_queues and dump_patterns:
|
||||||
|
dump_redis_data_to_csv(redis_client, dump_dir, dump_patterns)
|
||||||
|
|
||||||
|
if not queue_to_clear or queue_to_clear == DEFAULT_QUEUE_TO_CLEAR:
|
||||||
|
logger.info("Parameter 'queue_to_clear' is not specified or is the default placeholder. Skipping key deletion.")
|
||||||
|
# If we only wanted to dump, this is a success.
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info(f"Attempting to clear Redis key '{queue_to_clear}' using connection '{redis_conn_id}'.")
|
||||||
|
try:
|
||||||
|
deleted_count = redis_client.delete(queue_to_clear)
|
||||||
|
if deleted_count > 0:
|
||||||
|
logger.info(f"Successfully cleared Redis key '{queue_to_clear}'.")
|
||||||
|
else:
|
||||||
|
logger.info(f"Redis key '{queue_to_clear}' did not exist or was already empty.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to clear Redis key '{queue_to_clear}': {e}", exc_info=True)
|
||||||
|
raise AirflowException(f"Failed to clear Redis key: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def list_contents_callable(**context):
|
||||||
|
"""Lists the contents of the specified Redis key (list or hash)."""
|
||||||
|
params = context['params']
|
||||||
|
redis_conn_id = params['redis_conn_id']
|
||||||
|
queue_to_list = params['queue_to_list']
|
||||||
|
max_items = params.get('max_items', 10)
|
||||||
|
|
||||||
|
if not queue_to_list:
|
||||||
|
raise ValueError("Parameter 'queue_to_list' cannot be empty.")
|
||||||
|
|
||||||
|
logger.info(f"Attempting to list contents of Redis key '{queue_to_list}' (max: {max_items}) using connection '{redis_conn_id}'.")
|
||||||
|
try:
|
||||||
|
redis_client = _get_redis_client(redis_conn_id)
|
||||||
|
key_type_bytes = redis_client.type(queue_to_list)
|
||||||
|
key_type = key_type_bytes.decode('utf-8') # Decode type
|
||||||
|
|
||||||
|
if key_type == 'list':
|
||||||
|
list_length = redis_client.llen(queue_to_list)
|
||||||
|
# Get the last N items, which are the most recently added with rpush
|
||||||
|
items_to_fetch = min(max_items, list_length)
|
||||||
|
# lrange with negative indices gets items from the end of the list.
|
||||||
|
# -N to -1 gets the last N items.
|
||||||
|
contents_bytes = redis_client.lrange(queue_to_list, -items_to_fetch, -1)
|
||||||
|
contents = [item.decode('utf-8') for item in contents_bytes]
|
||||||
|
# Reverse the list so the absolute most recent item is printed first
|
||||||
|
contents.reverse()
|
||||||
|
logger.info(f"--- Contents of Redis List '{queue_to_list}' (showing most recent {len(contents)} of {list_length}) ---")
|
||||||
|
for i, item in enumerate(contents):
|
||||||
|
# The index here is just for display, 0 is the most recent
|
||||||
|
logger.info(f" [recent_{i}]: {item}")
|
||||||
|
if list_length > len(contents):
|
||||||
|
logger.info(f" ... ({list_length - len(contents)} older items not shown)")
|
||||||
|
logger.info(f"--- End of List Contents ---")
|
||||||
|
|
||||||
|
elif key_type == 'hash':
|
||||||
|
hash_size = redis_client.hlen(queue_to_list)
|
||||||
|
# HGETALL can be risky for large hashes. Consider HSCAN for production.
|
||||||
|
# For manual inspection, HGETALL is often acceptable.
|
||||||
|
if hash_size > max_items * 2: # Heuristic: avoid huge HGETALL
|
||||||
|
logger.warning(f"Hash '{queue_to_list}' has {hash_size} fields, which is large. Listing might be slow or incomplete. Consider using redis-cli HSCAN.")
|
||||||
|
# hgetall returns dict of bytes keys and bytes values, decode them
|
||||||
|
contents_bytes = redis_client.hgetall(queue_to_list)
|
||||||
|
contents = {k.decode('utf-8'): v.decode('utf-8') for k, v in contents_bytes.items()}
|
||||||
|
logger.info(f"--- Contents of Redis Hash '{queue_to_list}' ({len(contents)} fields) ---")
|
||||||
|
item_count = 0
|
||||||
|
for key, value in contents.items(): # key and value are now strings
|
||||||
|
if item_count >= max_items:
|
||||||
|
logger.info(f" ... (stopped listing after {max_items} items of {hash_size})")
|
||||||
|
break
|
||||||
|
# Attempt to pretty-print if value is JSON
|
||||||
|
try:
|
||||||
|
parsed_value = json.loads(value)
|
||||||
|
pretty_value = json.dumps(parsed_value, indent=2)
|
||||||
|
logger.info(f" '{key}':\n{pretty_value}")
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
logger.info(f" '{key}': {value}") # Print as string if not JSON
|
||||||
|
item_count += 1
|
||||||
|
logger.info(f"--- End of Hash Contents ---")
|
||||||
|
|
||||||
|
elif key_type == 'none':
|
||||||
|
logger.info(f"Redis key '{queue_to_list}' does not exist.")
|
||||||
|
else:
|
||||||
|
logger.info(f"Redis key '{queue_to_list}' is of type '{key_type}'. Listing contents for this type is not implemented.")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to list contents of Redis key '{queue_to_list}': {e}", exc_info=True)
|
||||||
|
raise AirflowException(f"Failed to list Redis key contents: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def check_status_callable(**context):
|
||||||
|
"""Checks the status (type and size) of all standard Redis queues for a given base name."""
|
||||||
|
params = context['params']
|
||||||
|
redis_conn_id = params['redis_conn_id']
|
||||||
|
queue_name = params.get('queue_name_for_status', DEFAULT_QUEUE_NAME)
|
||||||
|
queue_suffixes = ['_inbox', '_progress', '_result', '_fail']
|
||||||
|
|
||||||
|
logger.info(f"--- Checking Status for Queues with Base Name: '{queue_name}' ---")
|
||||||
|
|
||||||
|
try:
|
||||||
|
redis_client = _get_redis_client(redis_conn_id)
|
||||||
|
|
||||||
|
for suffix in queue_suffixes:
|
||||||
|
queue_to_check = f"{queue_name}{suffix}"
|
||||||
|
key_type = redis_client.type(queue_to_check).decode('utf-8')
|
||||||
|
size = 0
|
||||||
|
if key_type == 'list':
|
||||||
|
size = redis_client.llen(queue_to_check)
|
||||||
|
elif key_type == 'hash':
|
||||||
|
size = redis_client.hlen(queue_to_check)
|
||||||
|
|
||||||
|
if key_type != 'none':
|
||||||
|
logger.info(f" - Queue '{queue_to_check}': Type='{key_type.upper()}', Size={size}")
|
||||||
|
else:
|
||||||
|
logger.info(f" - Queue '{queue_to_check}': Does not exist.")
|
||||||
|
|
||||||
|
logger.info(f"--- End of Status Check ---")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to check queue status for base name '{queue_name}': {e}", exc_info=True)
|
||||||
|
raise AirflowException(f"Failed to check queue status: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def add_videos_to_queue_callable(**context):
|
||||||
|
"""
|
||||||
|
Parses video inputs, normalizes them to URLs, and adds them to a Redis queue.
|
||||||
|
"""
|
||||||
|
params = context["params"]
|
||||||
|
video_inputs = params["video_inputs"]
|
||||||
|
queue_name = params["queue_name"]
|
||||||
|
redis_conn_id = params["redis_conn_id"]
|
||||||
|
dry_run = params["dry_run"]
|
||||||
|
|
||||||
|
if not video_inputs:
|
||||||
|
logger.info("No video inputs provided. Nothing to do.")
|
||||||
|
print("No video inputs provided. Nothing to do.")
|
||||||
|
return
|
||||||
|
|
||||||
|
raw_items = parse_video_inputs(video_inputs)
|
||||||
|
if not raw_items:
|
||||||
|
logger.info("Input string was empty or contained no items after parsing.")
|
||||||
|
print("Input string was empty or contained no items after parsing.")
|
||||||
|
return
|
||||||
|
|
||||||
|
valid_urls = []
|
||||||
|
for item in raw_items:
|
||||||
|
url = normalize_to_url(item)
|
||||||
|
if url and url not in valid_urls:
|
||||||
|
valid_urls.append(url)
|
||||||
|
elif not url:
|
||||||
|
logger.warning(f"Skipping invalid input item: '{item}'")
|
||||||
|
|
||||||
|
if not valid_urls:
|
||||||
|
raise AirflowException("No valid YouTube URLs or IDs were found in the provided input.")
|
||||||
|
|
||||||
|
logger.info(f"Found {len(valid_urls)} valid and unique URLs to add to the queue:")
|
||||||
|
print(f"Found {len(valid_urls)} valid and unique URLs to add to the queue:")
|
||||||
|
for url in valid_urls:
|
||||||
|
logger.info(f" - {url}")
|
||||||
|
print(f" - {url}")
|
||||||
|
|
||||||
|
if dry_run:
|
||||||
|
logger.info("Dry run is enabled. Skipping Redis operation.")
|
||||||
|
print(f"\n[DRY RUN] Would have added {len(valid_urls)} URLs to the Redis list '{queue_name}_inbox'.")
|
||||||
|
return
|
||||||
|
|
||||||
|
# --- Add to Redis ---
|
||||||
|
try:
|
||||||
|
redis_client = _get_redis_client(redis_conn_id)
|
||||||
|
inbox_queue = f"{queue_name}_inbox"
|
||||||
|
|
||||||
|
# Use a pipeline for atomic and efficient addition
|
||||||
|
with redis_client.pipeline() as pipe:
|
||||||
|
for url in valid_urls:
|
||||||
|
pipe.rpush(inbox_queue, url)
|
||||||
|
pipe.execute()
|
||||||
|
|
||||||
|
final_list_length = redis_client.llen(inbox_queue)
|
||||||
|
|
||||||
|
success_message = (
|
||||||
|
f"Successfully added {len(valid_urls)} URLs to Redis list '{inbox_queue}'. "
|
||||||
|
f"The list now contains {final_list_length} items."
|
||||||
|
)
|
||||||
|
logger.info(success_message)
|
||||||
|
print(f"\n{success_message}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to add URLs to Redis queue '{inbox_queue}': {e}", exc_info=True)
|
||||||
|
raise AirflowException(f"Failed to add URLs to Redis: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
# --- DAG Definition ---
|
||||||
|
with DAG(
|
||||||
|
dag_id="ytdlp_mgmt_queues",
|
||||||
|
default_args={
|
||||||
|
"owner": "airflow",
|
||||||
|
"start_date": days_ago(1),
|
||||||
|
"retries": 0,
|
||||||
|
},
|
||||||
|
schedule=None,
|
||||||
|
catchup=False,
|
||||||
|
tags=["ytdlp", "queue", "management", "redis", "manual"],
|
||||||
|
doc_md="""
|
||||||
|
### YT-DLP Queue Management
|
||||||
|
|
||||||
|
This DAG provides a set of tools to manage Redis queues used by the YTDLP processing pipeline.
|
||||||
|
Select an `action` to perform when triggering the DAG.
|
||||||
|
|
||||||
|
**Actions:**
|
||||||
|
- `add_videos`: Add one or more YouTube videos to a queue.
|
||||||
|
- `clear_queue`: Dump and/or delete a specific Redis key.
|
||||||
|
- `list_contents`: View the contents of a Redis key (list or hash).
|
||||||
|
- `check_status`: Check the type and size of each standard queue for a base name.
|
||||||
|
""",
|
||||||
|
params={
|
||||||
|
"action": Param(
|
||||||
|
"add_videos",
|
||||||
|
type="string",
|
||||||
|
enum=["add_videos", "clear_queue", "list_contents", "check_status"],
|
||||||
|
title="Action",
|
||||||
|
description="The management action to perform.",
|
||||||
|
),
|
||||||
|
# --- Params for 'add_videos' ---
|
||||||
|
"video_inputs": Param(
|
||||||
|
None,
|
||||||
|
type=["null", "string"],
|
||||||
|
title="[add_videos] Video URLs or IDs",
|
||||||
|
description="A single item, comma-separated list, or JSON array of YouTube URLs or Video IDs.",
|
||||||
|
),
|
||||||
|
"queue_name": Param(
|
||||||
|
DEFAULT_QUEUE_NAME,
|
||||||
|
type="string",
|
||||||
|
title="[add_videos] Queue Name",
|
||||||
|
description="The base name of the Redis queue to add videos to (e.g., 'video_queue').",
|
||||||
|
),
|
||||||
|
"dry_run": Param(
|
||||||
|
False,
|
||||||
|
type="boolean",
|
||||||
|
title="[add_videos] Dry Run",
|
||||||
|
description="If True, validate inputs without adding them to the queue.",
|
||||||
|
),
|
||||||
|
# --- Params for 'clear_queue' ---
|
||||||
|
"queue_to_clear": Param(
|
||||||
|
DEFAULT_QUEUE_TO_CLEAR,
|
||||||
|
type="string",
|
||||||
|
title="[clear_queue] Queue to Clear",
|
||||||
|
description="Exact name of the Redis key to delete.",
|
||||||
|
),
|
||||||
|
"dump_queues": Param(
|
||||||
|
True,
|
||||||
|
type="boolean",
|
||||||
|
title="[clear_queue] Dump Data",
|
||||||
|
description="If True, dump data before clearing.",
|
||||||
|
),
|
||||||
|
"dump_dir": Param(
|
||||||
|
"{{ var.value.get('YTDLP_REDIS_DUMP_DIR', '/opt/airflow/dumps') }}",
|
||||||
|
type="string",
|
||||||
|
title="[clear_queue] Dump Directory",
|
||||||
|
description="Base directory to save CSV dump files.",
|
||||||
|
),
|
||||||
|
"dump_patterns": Param(
|
||||||
|
'ytdlp:*,video_queue_*',
|
||||||
|
type="string",
|
||||||
|
title="[clear_queue] Dump Patterns",
|
||||||
|
description="Comma-separated list of key patterns to dump.",
|
||||||
|
),
|
||||||
|
# --- Params for 'list_contents' ---
|
||||||
|
"queue_to_list": Param(
|
||||||
|
'video_queue_inbox',
|
||||||
|
type="string",
|
||||||
|
title="[list_contents] Queue to List",
|
||||||
|
description="Exact name of the Redis key to list.",
|
||||||
|
),
|
||||||
|
"max_items": Param(
|
||||||
|
10,
|
||||||
|
type="integer",
|
||||||
|
title="[list_contents] Max Items to List",
|
||||||
|
description="Maximum number of items to show.",
|
||||||
|
),
|
||||||
|
# --- Params for 'check_status' ---
|
||||||
|
"queue_name_for_status": Param(
|
||||||
|
DEFAULT_QUEUE_NAME,
|
||||||
|
type="string",
|
||||||
|
title="[check_status] Base Queue Name",
|
||||||
|
description="Base name of the queues to check (e.g., 'video_queue').",
|
||||||
|
),
|
||||||
|
# --- Common Params ---
|
||||||
|
"redis_conn_id": Param(
|
||||||
|
DEFAULT_REDIS_CONN_ID,
|
||||||
|
type="string",
|
||||||
|
title="Redis Connection ID",
|
||||||
|
),
|
||||||
|
},
|
||||||
|
) as dag:
|
||||||
|
branch_on_action = BranchPythonOperator(
|
||||||
|
task_id="branch_on_action",
|
||||||
|
python_callable=lambda **context: f"action_{context['params']['action']}",
|
||||||
|
)
|
||||||
|
|
||||||
|
action_add_videos = PythonOperator(
|
||||||
|
task_id="action_add_videos",
|
||||||
|
python_callable=add_videos_to_queue_callable,
|
||||||
|
)
|
||||||
|
|
||||||
|
action_clear_queue = PythonOperator(
|
||||||
|
task_id="action_clear_queue",
|
||||||
|
python_callable=clear_queue_callable,
|
||||||
|
templates_dict={'dump_dir': "{{ params.dump_dir }}"},
|
||||||
|
)
|
||||||
|
|
||||||
|
action_list_contents = PythonOperator(
|
||||||
|
task_id="action_list_contents",
|
||||||
|
python_callable=list_contents_callable,
|
||||||
|
)
|
||||||
|
|
||||||
|
action_check_status = PythonOperator(
|
||||||
|
task_id="action_check_status",
|
||||||
|
python_callable=check_status_callable,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Task Dependencies: fan out from the branch to the action tasks ---
|
||||||
|
branch_on_action >> [action_add_videos, action_clear_queue, action_list_contents, action_check_status]
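For reference, a run of this management DAG can also be started programmatically. A minimal sketch against Airflow's stable REST API is shown below; the endpoint, credentials, and the default behaviour of conf keys overriding matching DAG params are assumptions about a stock Airflow 2.x deployment:

import requests

# Assumes the DAG is unpaused and basic auth is enabled with these credentials.
resp = requests.post(
    "http://localhost:8080/api/v1/dags/ytdlp_mgmt_queues/dagRuns",
    auth=("airflow", "airflow"),
    json={"conf": {"action": "add_videos", "video_inputs": "dQw4w9WgXcQ", "dry_run": True}},
)
resp.raise_for_status()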
|
||||||
@@ -35,6 +35,19 @@ DEFAULT_MAX_URLS = '1' # Default number of URLs to process per run
|
|||||||
|
|
||||||
# --- Task Callables ---
|
# --- Task Callables ---
|
||||||
|
|
||||||
|
def select_account_callable(**context):
|
||||||
|
"""
|
||||||
|
Placeholder task for future logic to dynamically select an account.
|
||||||
|
For now, it just passes through the account_id from the DAG params.
|
||||||
|
"""
|
||||||
|
params = context['params']
|
||||||
|
account_id = params.get('account_id', 'default_account')
|
||||||
|
logger.info(f"Selected account for this run: {account_id}")
|
||||||
|
# This task could push the selected account_id to XComs in the future.
|
||||||
|
# For now, the next task will just read it from params.
|
||||||
|
return account_id
|
||||||
|
|
||||||
|
|
||||||
def log_trigger_info_callable(**context):
    """Logs information about how the DAG run was triggered."""
    dag_run = context['dag_run']
@@ -113,7 +126,8 @@ def check_queue_for_urls_batch(**context):
|
|||||||
'queue_name': queue_name,
|
'queue_name': queue_name,
|
||||||
'redis_conn_id': redis_conn_id,
|
'redis_conn_id': redis_conn_id,
|
||||||
'max_urls_per_run': int(max_urls),
|
'max_urls_per_run': int(max_urls),
|
||||||
'stop_on_failure': params.get('stop_on_failure', True)
|
'stop_on_failure': params.get('stop_on_failure', True),
|
||||||
|
'account_id': params.get('account_id', 'default_account')
|
||||||
}
|
}
|
||||||
trigger_configs.append(worker_conf)
|
trigger_configs.append(worker_conf)
|
||||||
return trigger_configs
|
return trigger_configs
|
||||||
@@ -140,18 +154,19 @@ default_args = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
with DAG(
|
with DAG(
|
||||||
dag_id='ytdlp_sensor_redis_queue',
|
dag_id='ytdlp_ops_sensor_queue',
|
||||||
default_args=default_args,
|
default_args=default_args,
|
||||||
schedule_interval='*/1 * * * *', # Runs every minute and can also be triggered.
|
schedule_interval=None, # Runs only on trigger, not on a schedule.
|
||||||
max_active_runs=1, # Prevent multiple sensors from running at once
|
max_active_runs=1, # Prevent multiple sensors from running at once
|
||||||
catchup=False,
|
catchup=False,
|
||||||
description='Polls Redis queue every minute (and on trigger) for URLs and starts worker DAGs.',
|
description='Polls Redis queue on trigger for URLs and starts worker DAGs.',
|
||||||
tags=['ytdlp', 'sensor', 'queue', 'redis', 'batch'],
|
tags=['ytdlp', 'sensor', 'queue', 'redis', 'batch'],
|
||||||
params={
|
params={
|
||||||
'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="Base name for Redis queues."),
|
'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="Base name for Redis queues."),
|
||||||
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
|
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
|
||||||
'max_urls_per_run': Param(DEFAULT_MAX_URLS, type="string", description="Maximum number of URLs to process in one batch."),
|
'max_urls_per_run': Param(DEFAULT_MAX_URLS, type="string", description="Maximum number of URLs to process in one batch."),
|
||||||
'stop_on_failure': Param(True, type="boolean", description="If True, a worker failure will stop the entire processing loop."),
|
'stop_on_failure': Param(True, type="boolean", description="If True, a worker failure will stop the entire processing loop."),
|
||||||
|
'account_id': Param('default_account', type="string", description="The account ID to use for processing the batch."),
|
||||||
}
|
}
|
||||||
) as dag:
|
) as dag:
|
||||||
|
|
||||||
@@ -179,7 +194,7 @@ with DAG(
|
|||||||
# This operator will be dynamically expanded based on the output of poll_redis_task
|
# This operator will be dynamically expanded based on the output of poll_redis_task
|
||||||
trigger_worker_dags = TriggerDagRunOperator.partial(
|
trigger_worker_dags = TriggerDagRunOperator.partial(
|
||||||
task_id='trigger_worker_dags',
|
task_id='trigger_worker_dags',
|
||||||
trigger_dag_id='ytdlp_worker_per_url',
|
trigger_dag_id='ytdlp_ops_worker_per_url',
|
||||||
wait_for_completion=False, # Fire and forget
|
wait_for_completion=False, # Fire and forget
|
||||||
doc_md="""
|
doc_md="""
|
||||||
### Trigger Worker DAGs (Dynamically Mapped)
|
### Trigger Worker DAGs (Dynamically Mapped)
|
||||||
@@ -191,4 +206,10 @@ This task is skipped if the polling task finds no URLs.
|
|||||||
conf=poll_redis_task.output
|
conf=poll_redis_task.output
|
||||||
)
|
)
|
||||||
|
|
||||||
log_trigger_info_task >> poll_redis_task >> trigger_worker_dags
|
select_account_task = PythonOperator(
|
||||||
|
task_id='select_account',
|
||||||
|
python_callable=select_account_callable,
|
||||||
|
)
|
||||||
|
select_account_task.doc_md = "### Select Account\n(Placeholder for future dynamic account selection logic)"
|
||||||
|
|
||||||
|
log_trigger_info_task >> select_account_task >> poll_redis_task >> trigger_worker_dags
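Because trigger_worker_dags is built with TriggerDagRunOperator.partial(...).expand(conf=poll_redis_task.output), one mapped trigger is created per element of the list the polling task returns, and each element becomes that worker run's conf. Assuming keys like those assembled in check_queue_for_urls_batch above (the 'url' key is set where the URL is popped, outside this hunk), each element looks roughly like this; the values are illustrative only:

example_trigger_confs = [
    {
        "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # assumed key
        "queue_name": "video_queue",
        "redis_conn_id": "redis_default",
        "max_urls_per_run": 1,
        "stop_on_failure": True,
        "account_id": "default_account",
    },
]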
|
||||||
@@ -74,13 +74,51 @@ def _extract_video_id(url):
|
|||||||
|
|
||||||
# --- Queue Management Callables (for success/failure reporting) ---
|
# --- Queue Management Callables (for success/failure reporting) ---
|
||||||
|
|
||||||
def handle_success(**context):
|
def mark_proxy_banned_callable(**context):
|
||||||
|
"""Makes a Thrift call to ban a proxy if the get_token task failed with a bannable error."""
|
||||||
|
ti = context['task_instance']
|
||||||
|
proxy_to_ban = ti.xcom_pull(task_ids='get_token', key='proxy_to_ban')
|
||||||
|
|
||||||
|
if not proxy_to_ban:
|
||||||
|
logger.info("No proxy to ban was pushed to XCom. Skipping task.")
|
||||||
|
raise AirflowSkipException("No proxy to ban was identified in the upstream failure.")
|
||||||
|
|
||||||
|
server_identity = ti.xcom_pull(task_ids='get_token', key='server_identity_for_ban')
|
||||||
|
host = ti.xcom_pull(task_ids='get_token', key='service_host_for_ban')
|
||||||
|
port = ti.xcom_pull(task_ids='get_token', key='service_port_for_ban')
|
||||||
|
|
||||||
|
if not all([server_identity, host, port]):
|
||||||
|
logger.error("Missing connection details (identity, host, or port) from XCom. Cannot ban proxy.")
|
||||||
|
raise AirflowException("Missing connection details to ban proxy.")
|
||||||
|
|
||||||
|
logger.warning(f"Attempting to ban proxy '{proxy_to_ban}' for server '{server_identity}' at {host}:{port}.")
|
||||||
|
|
||||||
|
transport = None
|
||||||
|
try:
|
||||||
|
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET)
|
||||||
|
socket_conn.setTimeout(15 * 1000) # 15s timeout for ban call
|
||||||
|
transport = TTransport.TFramedTransport(socket_conn)
|
||||||
|
protocol = TBinaryProtocol.TBinaryProtocol(transport)
|
||||||
|
client = YTTokenOpService.Client(protocol)
|
||||||
|
transport.open()
|
||||||
|
client.banProxy(proxyUrl=proxy_to_ban, serverIdentity=server_identity)
|
||||||
|
logger.info(f"Successfully sent request to ban proxy '{proxy_to_ban}'.")
|
||||||
|
except Exception as ban_exc:
|
||||||
|
logger.error(f"Failed to send ban request for proxy '{proxy_to_ban}': {ban_exc}", exc_info=True)
|
||||||
|
# We should fail the task if the ban call fails, as it's an important side-effect.
|
||||||
|
raise AirflowException(f"Failed to ban proxy: {ban_exc}")
|
||||||
|
finally:
|
||||||
|
if transport and transport.isOpen():
|
||||||
|
transport.close()
|
||||||
|
|
||||||
|
|
||||||
|
def mark_url_as_success(**context):
|
||||||
"""Moves URL from progress to result hash on success."""
|
"""Moves URL from progress to result hash on success."""
|
||||||
ti = context['task_instance']
|
ti = context['task_instance']
|
||||||
params = context['params']
|
params = context['params']
|
||||||
url = params.get('url') # Get URL from params, not XCom
|
url = params.get('url') # Get URL from params, not XCom
|
||||||
if not url:
|
if not url:
|
||||||
logger.warning("handle_success called but no URL found in DAG run parameters.")
|
logger.warning("mark_url_as_success called but no URL found in DAG run parameters.")
|
||||||
return
|
return
|
||||||
|
|
||||||
queue_name = params['queue_name']
|
queue_name = params['queue_name']
|
||||||
@@ -91,7 +129,7 @@ def handle_success(**context):
|
|||||||
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
|
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
|
||||||
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
|
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
|
||||||
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
|
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
|
||||||
downloaded_file_path = ti.xcom_pull(task_ids='download_video')
|
downloaded_file_path = ti.xcom_pull(task_ids='download_and_probe')
|
||||||
|
|
||||||
logger.info(f"Handling success for URL: {url}")
|
logger.info(f"Handling success for URL: {url}")
|
||||||
logger.info(f" Downloaded File Path: {downloaded_file_path}")
|
logger.info(f" Downloaded File Path: {downloaded_file_path}")
|
||||||
@@ -117,7 +155,8 @@ def handle_success(**context):
|
|||||||
logger.error(f"Error handling success in Redis for URL '{url}': {e}", exc_info=True)
|
logger.error(f"Error handling success in Redis for URL '{url}': {e}", exc_info=True)
|
||||||
# Log error but don't fail the task, as the main work succeeded.
|
# Log error but don't fail the task, as the main work succeeded.
|
||||||
|
|
||||||
def handle_failure(**context):
|
|
||||||
|
def mark_url_as_failed(**context):
|
||||||
"""
|
"""
|
||||||
Handles failed processing. Records detailed error information to the fail hash
|
Handles failed processing. Records detailed error information to the fail hash
|
||||||
and, if stop_on_failure is True, fails the task to make the DAG run failure visible.
|
and, if stop_on_failure is True, fails the task to make the DAG run failure visible.
|
||||||
@@ -126,42 +165,72 @@ def handle_failure(**context):
|
|||||||
params = context['params']
|
params = context['params']
|
||||||
url = params.get('url') # Get URL from params
|
url = params.get('url') # Get URL from params
|
||||||
if not url:
|
if not url:
|
||||||
logger.error("handle_failure called but no URL found in DAG run parameters.")
|
logger.error("mark_url_as_failed called but no URL found in DAG run parameters.")
|
||||||
return
|
return
|
||||||
|
|
||||||
queue_name = params['queue_name']
|
queue_name = params['queue_name']
|
||||||
fail_queue = f"{queue_name}_fail"
|
fail_queue = f"{queue_name}_fail"
|
||||||
inbox_queue = f"{queue_name}_inbox"
|
inbox_queue = f"{queue_name}_inbox"
|
||||||
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
|
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
|
||||||
requeue_on_failure = params.get('requeue_on_failure', False)
|
|
||||||
stop_on_failure = params.get('stop_on_failure', True)
|
stop_on_failure = params.get('stop_on_failure', True)
|
||||||
|
|
||||||
|
# Determine if we should requeue based on various parameters
|
||||||
|
should_requeue = params.get('requeue_on_failure', False)
|
||||||
|
requeue_on_bannable_error = params.get('requeue_on_bannable_error', False)
|
||||||
|
requeue_on_ffprobe_failure = params.get('requeue_on_ffprobe_failure', False)
|
||||||
|
|
||||||
# --- Extract Detailed Error Information ---
|
# --- Extract Detailed Error Information ---
|
||||||
exception = context.get('exception')
|
exception = context.get('exception')
|
||||||
error_message = str(exception) if exception else "Unknown error"
|
|
||||||
error_type = type(exception).__name__ if exception else "Unknown"
|
|
||||||
tb_str = "".join(traceback.format_exception(etype=type(exception), value=exception, tb=exception.__traceback__)) if exception else "No traceback available."
|
|
||||||
|
|
||||||
# Find the specific task that failed
|
# Find the specific task that failed to pull its XComs
|
||||||
dag_run = context['dag_run']
|
dag_run = context['dag_run']
|
||||||
failed_task_id = "unknown"
|
failed_task_id = "unknown"
|
||||||
# Look at direct upstream tasks of the current task ('handle_failure')
|
upstream_tasks = ti.task.get_direct_relatives(upstream=True)
|
||||||
upstream_tasks = ti.get_direct_relatives(upstream=True)
|
|
||||||
for task in upstream_tasks:
|
for task in upstream_tasks:
|
||||||
upstream_ti = dag_run.get_task_instance(task_id=task.task_id)
|
upstream_ti = dag_run.get_task_instance(task_id=task.task_id)
|
||||||
if upstream_ti and upstream_ti.state == 'failed':
|
if upstream_ti and upstream_ti.state == 'failed':
|
||||||
failed_task_id = task.task_id
|
failed_task_id = task.task_id
|
||||||
break
|
break
|
||||||
|
|
||||||
|
error_details = None
|
||||||
|
if failed_task_id != "unknown":
|
||||||
|
error_details = ti.xcom_pull(task_ids=failed_task_id, key='error_details')
|
||||||
|
|
||||||
|
if error_details:
|
||||||
|
error_message = error_details.get('error_message', 'Unknown error from XCom')
|
||||||
|
error_type = error_details.get('error_type', 'Unknown type from XCom')
|
||||||
|
tb_str = error_details.get('traceback', 'No traceback in XCom.')
|
||||||
|
else:
|
||||||
|
error_message = str(exception) if exception else "Unknown error"
|
||||||
|
error_type = type(exception).__name__ if exception else "Unknown"
|
||||||
|
tb_str = "".join(traceback.format_exception(etype=type(exception), value=exception, tb=exception.__traceback__)) if exception else "No traceback available."
|
||||||
|
|
||||||
logger.info(f"Handling failure for URL: {url}")
|
logger.info(f"Handling failure for URL: {url}")
|
||||||
logger.error(f" Failed Task: {failed_task_id}")
|
logger.error(f" Failed Task: {failed_task_id}")
|
||||||
logger.error(f" Failure Type: {error_type}")
|
logger.error(f" Failure Type: {error_type}")
|
||||||
logger.error(f" Failure Reason: {error_message}")
|
logger.error(f" Failure Reason: {error_message}")
|
||||||
logger.debug(f" Traceback:\n{tb_str}")
|
logger.debug(f" Traceback:\n{tb_str}")
|
||||||
|
|
||||||
|
# --- Check for specific requeue conditions ---
|
||||||
|
if not should_requeue: # Only check specific conditions if the general one is false
|
||||||
|
if requeue_on_bannable_error and isinstance(exception, PBServiceException):
|
||||||
|
bannable_error_codes = [
|
||||||
|
"BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED",
|
||||||
|
"SOCKS5_CONNECTION_FAILED", "CLIENT_TIMEOUT", "GLOBAL_TIMEOUT"
|
||||||
|
]
|
||||||
|
if hasattr(exception, 'errorCode') and exception.errorCode in bannable_error_codes:
|
||||||
|
should_requeue = True
|
||||||
|
logger.info(f"Bannable error '{exception.errorCode}' detected. Re-queuing URL as per 'requeue_on_bannable_error' param.")
|
||||||
|
|
||||||
|
if requeue_on_ffprobe_failure and isinstance(exception, AirflowException) and "Bash command failed" in str(exception):
|
||||||
|
# Check for the specific exit code for probe failure
|
||||||
|
if "exit code 2" in str(exception):
|
||||||
|
should_requeue = True
|
||||||
|
logger.info("Probe failure detected (exit code 2). Re-queuing URL as per 'requeue_on_ffprobe_failure' param.")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
client = _get_redis_client(redis_conn_id)
|
client = _get_redis_client(redis_conn_id)
|
||||||
if requeue_on_failure:
|
if should_requeue:
|
||||||
client.rpush(inbox_queue, url)
|
client.rpush(inbox_queue, url)
|
||||||
logger.info(f"Re-queued failed URL '{url}' to inbox '{inbox_queue}' for retry.")
|
logger.info(f"Re-queued failed URL '{url}' to inbox '{inbox_queue}' for retry.")
|
||||||
else:
|
else:
|
||||||
@@ -190,8 +259,9 @@ def handle_failure(**context):
|
|||||||
if exception:
|
if exception:
|
||||||
raise exception
|
raise exception
|
||||||
else:
|
else:
|
||||||
# If for some reason there's no exception, fail explicitly.
|
# If we got details from XCom, we don't have the original exception object.
|
||||||
raise AirflowException("Failing task as per stop_on_failure=True, but original exception was not found.")
|
# So, we raise a new AirflowException with the details we have.
|
||||||
|
raise AirflowException(f"Failing task as per stop_on_failure=True. Upstream error: [{error_type}] {error_message}")
|
||||||
|
|
||||||
# --- YtdlpOpsOperator ---
|
# --- YtdlpOpsOperator ---
|
||||||
|
|
||||||
@@ -232,12 +302,11 @@ class YtdlpOpsOperator(BaseOperator):
|
|||||||
transport = None
|
transport = None
|
||||||
ti = context['task_instance']
|
ti = context['task_instance']
|
||||||
|
|
||||||
try:
|
# Define connection parameters outside the try block to be available in except blocks
|
||||||
params = context['params']
|
params = context['params']
|
||||||
url = params.get('url')
|
url = params.get('url')
|
||||||
if not url:
|
if not url:
|
||||||
raise AirflowException("DAG was triggered without a 'url' in its configuration.")
|
raise AirflowException("DAG was triggered without a 'url' in its configuration.")
|
||||||
logger.info(f"Processing URL from DAG run config: {url}")
|
|
||||||
|
|
||||||
service_ip = self.render_template(self.service_ip, context)
|
service_ip = self.render_template(self.service_ip, context)
|
||||||
service_port_rendered = self.render_template(self.service_port, context)
|
service_port_rendered = self.render_template(self.service_port, context)
|
||||||
@@ -248,6 +317,7 @@ class YtdlpOpsOperator(BaseOperator):
|
|||||||
host = params.get('service_ip', service_ip)
|
host = params.get('service_ip', service_ip)
|
||||||
port_str = params.get('service_port', service_port_rendered)
|
port_str = params.get('service_port', service_port_rendered)
|
||||||
account_id = params.get('account_id', account_id)
|
account_id = params.get('account_id', account_id)
|
||||||
|
clients = params.get('clients')
|
||||||
|
|
||||||
logger.info(f"Using direct connection settings: service_ip={host}, service_port={port_str}")
|
logger.info(f"Using direct connection settings: service_ip={host}, service_port={port_str}")
|
||||||
|
|
||||||
@@ -264,6 +334,8 @@ class YtdlpOpsOperator(BaseOperator):
|
|||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
timeout = DEFAULT_TIMEOUT
|
timeout = DEFAULT_TIMEOUT
|
||||||
|
|
||||||
|
try:
|
||||||
|
logger.info(f"Processing URL from DAG run config: {url}")
|
||||||
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET)
|
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET)
|
||||||
socket_conn.setTimeout(timeout * 1000)
|
socket_conn.setTimeout(timeout * 1000)
|
||||||
transport = TTransport.TFramedTransport(socket_conn)
|
transport = TTransport.TFramedTransport(socket_conn)
|
||||||
@@ -278,7 +350,8 @@ class YtdlpOpsOperator(BaseOperator):
|
|||||||
token_data = client.getOrRefreshToken(
|
token_data = client.getOrRefreshToken(
|
||||||
accountId=account_id,
|
accountId=account_id,
|
||||||
updateType=TokenUpdateMode.AUTO,
|
updateType=TokenUpdateMode.AUTO,
|
||||||
url=url
|
url=url,
|
||||||
|
clients=clients
|
||||||
)
|
)
|
||||||
logger.info("Successfully retrieved token data from service.")
|
logger.info("Successfully retrieved token data from service.")
|
||||||
|
|
||||||
@@ -302,9 +375,54 @@ class YtdlpOpsOperator(BaseOperator):
|
|||||||
ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
|
ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
|
||||||
ti.xcom_push(key='ytdlp_command', value=ytdlp_cmd)
|
ti.xcom_push(key='ytdlp_command', value=ytdlp_cmd)
|
||||||
|
|
||||||
|
except (PBServiceException, TTransportException) as e:
|
||||||
|
# Enhanced logging to make failures clear in Airflow logs.
|
||||||
|
logger.error(f"Thrift call failed for URL '{url}' with account '{account_id}'.")
|
||||||
|
logger.error(f"Exception Type: {type(e).__name__}")
|
||||||
|
logger.error(f"Exception Message: {getattr(e, 'message', str(e))}")
|
||||||
|
if isinstance(e, PBServiceException):
|
||||||
|
logger.error(f"Service Error Code: {getattr(e, 'errorCode', 'N/A')}")
|
||||||
|
if hasattr(e, 'context') and e.context:
|
||||||
|
logger.error(f"Service Context: {e.context}")
|
||||||
|
|
||||||
|
# Use exc_info=True to get the full traceback in the logs
|
||||||
|
logger.error("Full exception traceback:", exc_info=True)
|
||||||
|
|
||||||
|
# Push exception details to XCom for the failure handler
|
||||||
|
error_details = {
|
||||||
|
'error_message': getattr(e, 'message', str(e)),
|
||||||
|
'error_type': type(e).__name__,
|
||||||
|
'traceback': traceback.format_exc()
|
||||||
|
}
|
||||||
|
ti.xcom_push(key='error_details', value=error_details)
|
||||||
|
|
||||||
|
proxy_to_ban = None
|
||||||
|
if isinstance(e, PBServiceException) and hasattr(e, 'context') and e.context:
|
||||||
|
# Assuming server adds 'proxy_url' to context on failure
|
||||||
|
proxy_to_ban = e.context.get('proxy_url')
|
||||||
|
bannable_error_codes = [
|
||||||
|
"BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED",
|
||||||
|
"SOCKS5_CONNECTION_FAILED", "CLIENT_TIMEOUT", "GLOBAL_TIMEOUT"
|
||||||
|
]
|
||||||
|
if e.errorCode not in bannable_error_codes:
|
||||||
|
proxy_to_ban = None
|
||||||
|
|
||||||
|
if proxy_to_ban:
|
||||||
|
logger.info(f"Found proxy to ban: {proxy_to_ban}. Pushing to XCom for 'mark_proxy_banned' task.")
|
||||||
|
ti.xcom_push(key='proxy_to_ban', value=proxy_to_ban)
|
||||||
|
ti.xcom_push(key='server_identity_for_ban', value=account_id)
|
||||||
|
ti.xcom_push(key='service_host_for_ban', value=host)
|
||||||
|
ti.xcom_push(key='service_port_for_ban', value=port)
|
||||||
|
else:
|
||||||
|
logger.info("No specific proxy to ban based on the error context.")
|
||||||
|
# Push None explicitly so the downstream task knows not to run
|
||||||
|
ti.xcom_push(key='proxy_to_ban', value=None)
|
||||||
|
|
||||||
|
# Re-raise the original exception to fail the Airflow task
|
||||||
|
raise e
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"YtdlpOpsOperator (Worker) failed: {e}", exc_info=True)
|
logger.error(f"YtdlpOpsOperator (Worker) failed with an unexpected exception: {e}", exc_info=True)
|
||||||
raise AirflowException(f"Task failed: {e}")
|
raise AirflowException(f"Task failed with unexpected error: {e}")
|
||||||
finally:
|
finally:
|
||||||
if transport and transport.isOpen():
|
if transport and transport.isOpen():
|
||||||
transport.close()
|
transport.close()
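For readability when tracing this worker DAG, the XCom keys the operator pushes for downstream tasks are summarised below; the list is gathered from the xcom_push calls above, and the short descriptions are inferred from context rather than taken from the source:

GET_TOKEN_XCOM_KEYS = {
    "info_json_path": "path to the saved info.json used by download_and_probe",
    "socks_proxy": "SOCKS proxy URL handed to yt-dlp, if any",
    "ytdlp_command": "yt-dlp command suggested by the token service",
    "error_details": "error message/type/traceback pushed on Thrift failures",
    "proxy_to_ban": "proxy URL for mark_proxy_banned, or None",
    "server_identity_for_ban": "account ID passed to banProxy",
    "service_host_for_ban": "Thrift host for the ban call",
    "service_port_for_ban": "Thrift port for the ban call",
}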
|
||||||
@@ -344,13 +462,13 @@ default_args = {
|
|||||||
'depends_on_past': False,
|
'depends_on_past': False,
|
||||||
'email_on_failure': False,
|
'email_on_failure': False,
|
||||||
'email_on_retry': False,
|
'email_on_retry': False,
|
||||||
'retries': 1,
|
'retries': 0,
|
||||||
'retry_delay': timedelta(minutes=1),
|
'retry_delay': timedelta(minutes=1),
|
||||||
'start_date': days_ago(1),
|
'start_date': days_ago(1),
|
||||||
}
|
}
|
||||||
|
|
||||||
with DAG(
|
with DAG(
|
||||||
dag_id='ytdlp_worker_per_url',
|
dag_id='ytdlp_ops_worker_per_url',
|
||||||
default_args=default_args,
|
default_args=default_args,
|
||||||
schedule_interval=None,
|
schedule_interval=None,
|
||||||
catchup=False,
|
catchup=False,
|
||||||
@@ -366,12 +484,16 @@ with DAG(
|
|||||||
'service_ip': Param('89.253.221.173', type="string", description="Service IP."),
|
'service_ip': Param('89.253.221.173', type="string", description="Service IP."),
|
||||||
'service_port': Param(9090, type="integer", description="Service port."),
|
'service_port': Param(9090, type="integer", description="Service port."),
|
||||||
'account_id': Param('default_account', type="string", description="Account ID for the API call."),
|
'account_id': Param('default_account', type="string", description="Account ID for the API call."),
|
||||||
|
'clients': Param('ios', type="string", description="Comma-separated list of clients to use for token generation (e.g., 'ios,android,mweb')."),
|
||||||
'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
|
'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
|
||||||
'download_format': Param('ba[ext=m4a]/bestaudio/best', type="string", description="yt-dlp format selection string."),
|
'download_format': Param('ba[ext=m4a]/bestaudio/best', type="string", description="yt-dlp format selection string."),
|
||||||
'output_path_template': Param("%(title)s [%(id)s].%(ext)s", type="string", description="yt-dlp output filename template."),
|
'output_path_template': Param("%(title)s [%(id)s].%(ext)s", type="string", description="yt-dlp output filename template."),
|
||||||
'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json."),
|
'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json."),
|
||||||
'requeue_on_failure': Param(False, type="boolean", description="If True, re-adds the URL to the inbox on failure instead of moving to the fail hash."),
|
'requeue_on_failure': Param(False, type="boolean", description="If True, re-adds the URL to the inbox on failure instead of moving to the fail hash."),
|
||||||
'stop_on_failure': Param(True, type="boolean", description="If True, a worker failure will stop the entire processing loop."),
|
'stop_on_failure': Param(True, type="boolean", description="If True, a worker failure will stop the entire processing loop."),
|
||||||
|
'retry_on_probe_failure': Param(False, type="boolean", description="If True, attempts to re-download and probe a file if the initial probe fails."),
|
||||||
|
'requeue_on_bannable_error': Param(False, type="boolean", description="If True, re-queues the URL if a bannable error (proxy, bot detection) occurs."),
|
||||||
|
'requeue_on_ffprobe_failure': Param(False, type="boolean", description="If True, re-queues the URL if the ffmpeg/ffprobe check fails."),
|
||||||
}
|
}
|
||||||
) as dag:
|
) as dag:
|
||||||
|
|
||||||
@@ -382,12 +504,13 @@ with DAG(
|
|||||||
account_id="{{ params.account_id }}",
|
account_id="{{ params.account_id }}",
|
||||||
timeout="{{ params.timeout }}",
|
timeout="{{ params.timeout }}",
|
||||||
info_json_dir="{{ params.info_json_dir }}",
|
info_json_dir="{{ params.info_json_dir }}",
|
||||||
retries=0,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
download_video = BashOperator(
|
download_and_probe = BashOperator(
|
||||||
task_id='download_video',
|
task_id='download_and_probe',
|
||||||
bash_command="""
|
bash_command="""
|
||||||
|
set -e
|
||||||
|
|
||||||
INFO_JSON_PATH="{{ ti.xcom_pull(task_ids='get_token', key='info_json_path') }}"
|
INFO_JSON_PATH="{{ ti.xcom_pull(task_ids='get_token', key='info_json_path') }}"
|
||||||
PROXY="{{ ti.xcom_pull(task_ids='get_token', key='socks_proxy') }}"
|
PROXY="{{ ti.xcom_pull(task_ids='get_token', key='socks_proxy') }}"
|
||||||
FORMAT="{{ params.download_format }}"
|
FORMAT="{{ params.download_format }}"
|
||||||
@@ -395,7 +518,7 @@ with DAG(
|
|||||||
FILENAME_TEMPLATE="{{ params.output_path_template }}"
|
FILENAME_TEMPLATE="{{ params.output_path_template }}"
|
||||||
FULL_OUTPUT_PATH="$DOWNLOAD_DIR/$FILENAME_TEMPLATE"
|
FULL_OUTPUT_PATH="$DOWNLOAD_DIR/$FILENAME_TEMPLATE"
|
||||||
|
|
||||||
echo "Starting download..."
|
echo "--- Starting Download Step ---"
|
||||||
echo "Info JSON Path: $INFO_JSON_PATH"
|
echo "Info JSON Path: $INFO_JSON_PATH"
|
||||||
echo "Proxy: $PROXY"
|
echo "Proxy: $PROXY"
|
||||||
echo "Format: $FORMAT"
|
echo "Format: $FORMAT"
|
||||||
@@ -412,10 +535,9 @@ with DAG(
|
|||||||
CMD_ARRAY+=(--proxy "$PROXY")
|
CMD_ARRAY+=(--proxy "$PROXY")
|
||||||
fi
|
fi
|
||||||
CMD_ARRAY+=(-f "$FORMAT" -o "$FULL_OUTPUT_PATH" --print filename)
|
CMD_ARRAY+=(-f "$FORMAT" -o "$FULL_OUTPUT_PATH" --print filename)
|
||||||
CMD_ARRAY+=(--no-progress --no-simulate --no-write-info-json --ignore-errors --no-playlist)
|
CMD_ARRAY+=(--continue --no-progress --no-simulate --no-write-info-json --ignore-errors --no-playlist)
|
||||||
|
|
||||||
printf "Executing: %q " "${CMD_ARRAY[@]}"
|
echo "Executing: $(printf "%q " "${CMD_ARRAY[@]}")"
|
||||||
echo ""
|
|
||||||
|
|
||||||
FINAL_FILENAME=$("${CMD_ARRAY[@]}")
|
FINAL_FILENAME=$("${CMD_ARRAY[@]}")
|
||||||
EXIT_CODE=$?
|
EXIT_CODE=$?
|
||||||
@@ -430,17 +552,64 @@ with DAG(
|
|||||||
echo "Error: Download failed or did not produce a file."
|
echo "Error: Download failed or did not produce a file."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "SUCCESS: Final file confirmed at: $FINAL_FILENAME"
|
echo "SUCCESS: Download complete. Final file at: $FINAL_FILENAME"
|
||||||
|
|
||||||
|
echo "--- Starting Probe Step ---"
|
||||||
|
echo "Probing downloaded file: $FINAL_FILENAME"
|
||||||
|
if ! ffmpeg -v error -i "$FINAL_FILENAME" -f null - ; then
|
||||||
|
echo "Error: ffmpeg probe check failed for '$FINAL_FILENAME'. The file might be corrupt."
|
||||||
|
|
||||||
|
if [ "{{ params.retry_on_probe_failure }}" == "True" ]; then
|
||||||
|
echo "Attempting one retry on probe failure..."
|
||||||
|
echo "Renaming to .part to attempt resuming download."
|
||||||
|
mv -f "$FINAL_FILENAME" "$FINAL_FILENAME.part"
|
||||||
|
|
||||||
|
# Re-run download command
|
||||||
|
echo "Re-executing: $(printf "%q " "${CMD_ARRAY[@]}")"
|
||||||
|
FINAL_FILENAME=$("${CMD_ARRAY[@]}")
|
||||||
|
EXIT_CODE=$?
|
||||||
|
echo "yt-dlp retry exited with code: $EXIT_CODE"
|
||||||
|
|
||||||
|
if [ $EXIT_CODE -ne 0 ]; then
|
||||||
|
echo "Error: yt-dlp retry command failed."
|
||||||
|
exit $EXIT_CODE
|
||||||
|
fi
|
||||||
|
if [ -z "$FINAL_FILENAME" ] || [ ! -f "$FINAL_FILENAME" ]; then
|
||||||
|
echo "Error: Retry download failed or did not produce a file."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "SUCCESS: Retry download complete. Final file at: $FINAL_FILENAME"
|
||||||
|
|
||||||
|
# Re-probe
|
||||||
|
echo "Probing redownloaded file: $FINAL_FILENAME"
|
||||||
|
if ! ffmpeg -v error -i "$FINAL_FILENAME" -f null - ; then
|
||||||
|
echo "Error: ffmpeg probe check failed again for '$FINAL_FILENAME'. Failing with exit code 2."
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "Failing with exit code 2 due to probe failure (retries disabled)."
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
echo "SUCCESS: Probe confirmed valid media file."
|
||||||
|
|
||||||
|
# Push the final filename for the success_task
|
||||||
echo "$FINAL_FILENAME"
|
echo "$FINAL_FILENAME"
|
||||||
""",
|
""",
|
||||||
retries=3,
|
retries=0, # Retries are now handled inside the script based on a DAG param
|
||||||
retry_delay=timedelta(minutes=2),
|
retry_delay=timedelta(minutes=1),
|
||||||
|
)
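The probe step in the bash script above shells out to ffmpeg. An equivalent helper in Python, shown only as an illustrative sketch of what the inline check does, would be:

import subprocess

def probe_media_file(path):
    """Return True if `ffmpeg -v error -i <path> -f null -` exits cleanly, mirroring the bash test."""
    result = subprocess.run(
        ["ffmpeg", "-v", "error", "-i", path, "-f", "null", "-"],
        capture_output=True, text=True,
    )
    return result.returncode == 0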
|
||||||
|
|
||||||
|
mark_proxy_banned = PythonOperator(
|
||||||
|
task_id='mark_proxy_banned',
|
||||||
|
python_callable=mark_proxy_banned_callable,
|
||||||
|
trigger_rule='one_failed', # Run only if get_token fails
|
||||||
)
|
)
|
||||||
|
|
||||||
# This task triggers the sensor DAG to check for more work as soon as this worker is done.
|
# This task triggers the sensor DAG to check for more work as soon as this worker is done.
|
||||||
trigger_sensor_for_next_batch = TriggerDagRunOperator(
|
trigger_sensor_for_next_batch = TriggerDagRunOperator(
|
||||||
task_id='trigger_sensor_for_next_batch',
|
task_id='trigger_sensor_for_next_batch',
|
||||||
trigger_dag_id='ytdlp_sensor_redis_queue',
|
trigger_dag_id='ytdlp_ops_sensor_queue',
|
||||||
# Pass only the sensor's needed parameters back to it.
|
# Pass only the sensor's needed parameters back to it.
|
||||||
# These values were originally passed from the sensor to this worker.
|
# These values were originally passed from the sensor to this worker.
|
||||||
# The values are templated and will be passed as strings to the triggered DAG.
|
# The values are templated and will be passed as strings to the triggered DAG.
|
||||||
@@ -462,25 +631,25 @@ with DAG(
|
|||||||
|
|
||||||
# Define success and failure handling tasks
|
# Define success and failure handling tasks
|
||||||
success_task = PythonOperator(
|
success_task = PythonOperator(
|
||||||
task_id='handle_success',
|
task_id='mark_url_as_success',
|
||||||
python_callable=handle_success,
|
python_callable=mark_url_as_success,
|
||||||
trigger_rule='all_success', # Run only if upstream tasks succeeded
|
trigger_rule='all_success', # Run only if upstream tasks succeeded
|
||||||
)
|
)
|
||||||
|
|
||||||
failure_task = PythonOperator(
|
failure_task = PythonOperator(
|
||||||
task_id='handle_failure',
|
task_id='mark_url_as_failed',
|
||||||
python_callable=handle_failure,
|
python_callable=mark_url_as_failed,
|
||||||
trigger_rule='one_failed', # Run if any upstream task failed
|
trigger_rule='one_failed', # Run if any upstream task failed
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- Define Task Dependencies ---
|
# --- Define Task Dependencies ---
|
||||||
|
|
||||||
# The main processing flow
|
# The main success flow
|
||||||
get_token >> download_video
|
get_token >> download_and_probe >> success_task >> trigger_sensor_for_next_batch
|
||||||
|
|
||||||
# The success path: if download_video succeeds, run success_task, then trigger the next sensor run.
|
# The failure path for get_token, which includes the explicit ban task
|
||||||
download_video >> success_task >> trigger_sensor_for_next_batch
|
get_token >> mark_proxy_banned
|
||||||
|
|
||||||
# The failure path: if get_token OR download_video fails, run the failure_task.
|
# The main failure handler, which listens to the primary tasks.
|
||||||
# This is a "fan-in" dependency.
|
# If get_token or download_and_probe fails, it will trigger failure_task.
|
||||||
[get_token, download_video] >> failure_task
|
[get_token, download_and_probe] >> failure_task
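The fan-in above relies on trigger rules: failure_task runs with one_failed, while the success path keeps the default all_success. A self-contained sketch of the same pattern, with purely illustrative task names, is:

from datetime import datetime
from airflow import DAG
from airflow.operators.empty import EmptyOperator

with DAG(dag_id="trigger_rule_sketch", start_date=datetime(2024, 1, 1), schedule=None):
    a = EmptyOperator(task_id="a")
    b = EmptyOperator(task_id="b")
    on_success = EmptyOperator(task_id="on_success")                    # default all_success
    on_fail = EmptyOperator(task_id="on_fail", trigger_rule="one_failed")
    a >> b >> on_success
    [a, b] >> on_fail   # fires as soon as either primary task fails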
|
||||||
@@ -1,966 +0,0 @@
|
|||||||
"""
|
|
||||||
DAG to deploy and manage YTDLP token service.
|
|
||||||
|
|
||||||
This DAG handles the deployment, monitoring, and cleanup of a YTDLP token service
|
|
||||||
for a given account. It supports both Redis-based service discovery and direct
|
|
||||||
connection via manually specified host and port.
|
|
||||||
|
|
||||||
Configuration Options:
|
|
||||||
- account_id: (Required) The account ID for which the service is being deployed.
|
|
||||||
- proxy: (Optional) The proxy to use for the service.
|
|
||||||
- redis_enabled: (Optional, default=True) Whether to use Redis for service discovery.
|
|
||||||
If False, you must provide `host` and `port` manually.
|
|
||||||
- host: (Optional) The host IP of the service. Required if `redis_enabled=False`.
|
|
||||||
- port: (Optional) The port of the service. Required if `redis_enabled=False`.
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
1. Redis-based service discovery:
|
|
||||||
- Set `redis_enabled=True` (default).
|
|
||||||
- Ensure Redis is configured in Airflow connections.
|
|
||||||
- The DAG will automatically discover the service IP and port from Redis.
|
|
||||||
|
|
||||||
2. Manual host and port:
|
|
||||||
- Set `redis_enabled=False`.
|
|
||||||
- Provide `host` and `port` manually in the DAG configuration.
|
|
||||||
- Example: {"host": "192.168.1.100", "port": 9090}.
|
|
||||||
|
|
||||||
Example Trigger Configuration:
|
|
||||||
{
|
|
||||||
"account_id": "test_account",
|
|
||||||
"proxy": "socks5://proxy.example.com:1080",
|
|
||||||
"redis_enabled": False,
|
|
||||||
"host": "192.168.1.100",
|
|
||||||
"port": 9090
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
|
|
||||||
from airflow import DAG
|
|
||||||
from airflow.models.param import Param
|
|
||||||
from airflow.operators.empty import EmptyOperator
|
|
||||||
from airflow.operators.python import PythonOperator
|
|
||||||
# HttpSensor is no longer used
|
|
||||||
# from airflow.providers.http.sensors.http import HttpSensor
|
|
||||||
from airflow.utils.trigger_rule import TriggerRule
|
|
||||||
from airflow.hooks.base import BaseHook
|
|
||||||
from airflow.exceptions import AirflowException
|
|
||||||
from typing import Sequence # Add Sequence for type hinting
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
from airflow.utils.dates import days_ago # Add this import
|
|
||||||
import uuid
|
|
||||||
import os
|
|
||||||
import logging
|
|
||||||
import shutil
|
|
||||||
import docker
|
|
||||||
import uuid
|
|
||||||
import redis
|
|
||||||
import requests
|
|
||||||
import socket
|
|
||||||
import time
|
|
||||||
import sys # Import sys for maxsize
|
|
||||||
from airflow.configuration import conf # Import conf
|
|
||||||
|
|
||||||
# Import and apply Thrift exceptions patch
|
|
||||||
try:
|
|
||||||
# Always apply the patch, regardless of environment
|
|
||||||
from thrift_exceptions_patch import patch_thrift_exceptions
|
|
||||||
patch_thrift_exceptions()
|
|
||||||
logging.info("Applied Thrift exceptions patch for Airflow compatibility")
|
|
||||||
|
|
||||||
# Verify the patch was applied correctly
|
|
||||||
try:
|
|
||||||
from pangramia.yt.exceptions.ttypes import PBServiceException
|
|
||||||
test_exception = PBServiceException(message="Test")
|
|
||||||
# Try to modify attributes to verify patch works
|
|
||||||
test_exception.args = ("Test",)
|
|
||||||
test_exception.message = "Modified test"
|
|
||||||
logging.info("Verified Thrift exception patch is working correctly")
|
|
||||||
except Exception as verify_error:
|
|
||||||
logging.error(f"Thrift exception patch verification failed: {verify_error}")
|
|
||||||
logging.error("This may cause 'immutable instance' errors during error handling")
|
|
||||||
except ImportError as e:
|
|
||||||
logging.warning(f"Could not import thrift_exceptions_patch: {e}")
|
|
||||||
logging.warning("Airflow compatibility will be affected - expect 'immutable instance' errors")
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"Error applying Thrift exceptions patch: {e}")
|
|
||||||
|
|
||||||
# Default arguments for the DAG
|
|
||||||
default_args = {
|
|
||||||
'owner': 'airflow',
|
|
||||||
'depends_on_past': False,
|
|
||||||
'email_on_failure': False,
|
|
||||||
'email_on_retry': False,
|
|
||||||
'retries': 0, # Disable retries for all tasks in this DAG
|
|
||||||
'retry_delay': timedelta(minutes=5),
|
|
||||||
# Removed 'queue': 'auth_queue' to use the default queue
|
|
||||||
# Optional: Further filter workers by tags if using CeleryExecutor
|
|
||||||
'executor_config': {"CeleryExecutor": {"tags": ["auth_node"]}},
|
|
||||||
}
|
|
||||||
|
|
||||||
def get_redis_connection(redis_host=None, redis_port=None):
|
|
||||||
"""Get a Redis connection using Airflow's Redis connection or manually specified host/port."""
|
|
||||||
if redis_host and redis_port:
|
|
||||||
# Use manually specified host and port
|
|
||||||
return redis.Redis(
|
|
||||||
host=redis_host,
|
|
||||||
port=redis_port,
|
|
||||||
db=0,
|
|
||||||
decode_responses=True
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Use Airflow's Redis connection
|
|
||||||
redis_conn = BaseHook.get_connection("redis_default")
|
|
||||||
# Use the password from the connection if available, otherwise use 'airflow' as default
|
|
||||||
password = redis_conn.password or 'airflow'
|
|
||||||
return redis.Redis(
|
|
||||||
host=redis_conn.host, # 'redis' (service name in docker-compose)
|
|
||||||
port=redis_conn.port, # 6379
|
|
||||||
password=password,
|
|
||||||
db=0,
|
|
||||||
decode_responses=True
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_free_port():
|
|
||||||
"""Find and return a free port."""
|
|
||||||
import socket
|
|
||||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
||||||
s.bind(('0.0.0.0', 0))
|
|
||||||
return s.getsockname()[1]
|
|
||||||
|
|
||||||
def is_port_free(p):
|
|
||||||
"""Check if a port is free to use."""
|
|
||||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
||||||
try:
|
|
||||||
s.bind(('0.0.0.0', p))
|
|
||||||
return True
|
|
||||||
except OSError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def store_account_metadata(account_id, ip, port, proxy=None, health_port=None, container_id=None):
|
|
||||||
"""Store account metadata in Redis."""
|
|
||||||
redis_client = get_redis_connection()
|
|
||||||
try:
|
|
||||||
# Verify Redis connection
|
|
||||||
if not redis_client.ping():
|
|
||||||
raise ConnectionError("Failed to connect to Redis")
|
|
||||||
|
|
||||||
# Store main account metadata
|
|
||||||
mapping = {
|
|
||||||
"ip": ip,
|
|
||||||
"port": str(port),
|
|
||||||
"status": "running",
|
|
||||||
"start_time": str(time.time())
|
|
||||||
}
|
|
||||||
if proxy:
|
|
||||||
mapping["proxy"] = proxy
|
|
||||||
if health_port:
|
|
||||||
mapping["health_port"] = str(health_port)
|
|
||||||
if container_id:
|
|
||||||
mapping["container_id"] = container_id
|
|
||||||
|
|
||||||
# Use pipeline for atomic operations
|
|
||||||
with redis_client.pipeline() as pipe:
|
|
||||||
# Store main metadata
|
|
||||||
pipe.hset(f"ytdlp:{account_id}", mapping=mapping)
|
|
||||||
# Set expiration (1 week)
|
|
||||||
pipe.expire(f"ytdlp:{account_id}", 604800)
|
|
||||||
# Add to account list
|
|
||||||
pipe.sadd("ytdlp:accounts", account_id)
|
|
||||||
# Execute all commands
|
|
||||||
results = pipe.execute()
|
|
||||||
|
|
||||||
# Verify all commands succeeded
|
|
||||||
if not all(results):
|
|
||||||
raise RuntimeError(f"Failed to store metadata for {account_id}. Pipeline results: {results}")
|
|
||||||
|
|
||||||
# Verify the data was actually stored
|
|
||||||
stored_data = redis_client.hgetall(f"ytdlp:{account_id}")
|
|
||||||
if not stored_data:
|
|
||||||
raise RuntimeError(f"Failed to verify stored data for {account_id}")
|
|
||||||
|
|
||||||
logging.info(f"Successfully stored account metadata for {account_id} in Redis: {stored_data}")
|
|
||||||
return True
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"Failed to store account metadata for {account_id}: {e}", exc_info=True)
|
|
||||||
# Attempt cleanup if storage failed
|
|
||||||
try:
|
|
||||||
redis_client = get_redis_connection() # Ensure client is available
|
|
||||||
redis_client.delete(f"ytdlp:{account_id}")
|
|
||||||
redis_client.srem("ytdlp:accounts", account_id)
|
|
||||||
except Exception as cleanup_error:
|
|
||||||
logging.error(f"Failed to cleanup failed storage for {account_id}: {cleanup_error}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
# Removed get_account_metadata function as the service now handles Redis registration checks.
|
|
||||||
|
|
||||||
def prepare_and_deploy_service(**context):
|
|
||||||
"""Prepare deployment and deploy the Docker service."""
|
|
||||||
# Retrieve account_id, proxy, clients, and other parameters from DAG run configuration (conf)
|
|
||||||
# Set default values for account_id, proxy, and redis_enabled
|
|
||||||
account_id = context['dag_run'].conf.get('account_id') or context['params'].get('account_id', 'account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09')
|
|
||||||
proxy = context['dag_run'].conf.get('proxy') or context['params'].get('proxy', 'socks5://sslocal-rust-1084:1084')
|
|
||||||
clients = context['dag_run'].conf.get('clients') or context['params'].get('clients', 'ios,android,mweb')
|
|
||||||
redis_enabled = context['dag_run'].conf.get('redis_enabled', False) # Default to False
|
|
||||||
host_param = context['dag_run'].conf.get('host') # Host parameter from config
|
|
||||||
port_param = context['dag_run'].conf.get('port') # Port parameter from config
|
|
||||||
docker_network = context['dag_run'].conf.get('docker_network') or context['params'].get('docker_network', 'airflow_prod_proxynet')
|
|
||||||
host_external_ip_env = os.getenv('HOST_EXTERNAL_IP') # Explicit external IP from environment
|
|
||||||
|
|
||||||
if not account_id:
|
|
||||||
raise ValueError("Account ID is missing.")
|
|
||||||
|
|
||||||
# --- Port Determination ---
|
|
||||||
# Assign a free port if not provided, or validate the provided one
|
|
||||||
if not port_param:
|
|
||||||
port = get_free_port()
|
|
||||||
if not is_port_free(port):
|
|
||||||
raise ValueError(f"Assigned port {port} is already in use")
|
|
||||||
logging.info(f"No port provided, assigned free port: {port}")
|
|
||||||
else:
|
|
||||||
port = int(port_param)
|
|
||||||
if not is_port_free(port):
|
|
||||||
raise ValueError(f"Provided port {port} is already in use")
|
|
||||||
logging.info(f"Using provided port: {port}")
|
|
||||||
|
|
||||||
# Determine health port
|
|
||||||
health_port = port + 1
|
|
||||||
if not is_port_free(health_port):
|
|
||||||
raise ValueError(f"Health port {health_port} (derived from port {port}) is already in use")
|
|
||||||
logging.info(f"Using health port: {health_port}")
|
|
||||||
|
|
||||||
|
|
||||||
    # --- Host Determination ---
    # host_for_registration: IP/Host for client discovery (Redis/Logs)
    # host_for_sensor: Hostname/IP for Airflow HttpSensor health check

    host_for_registration = host_param  # Start with the parameter value

    if redis_enabled:
        # If Redis is enabled, the registration host should ideally be externally reachable
        if not host_for_registration:
            host_for_registration = host_external_ip_env  # Use external IP from env var if available
            if not host_for_registration:
                # If no env var, try fetching the external IP using requests
                try:
                    logging.info("HOST_EXTERNAL_IP not set. Attempting to fetch external IP from api.ipify.org...")
                    response = requests.get('https://api.ipify.org', timeout=10)  # 10 second timeout
                    response.raise_for_status()  # Raise exception for bad status codes
                    host_for_registration = response.text.strip()
                    if not host_for_registration:  # Check if response was empty
                        raise ValueError("Received empty response from api.ipify.org")
                    logging.info(f"Successfully fetched external IP: {host_for_registration}")
                except requests.exceptions.RequestException as e:
                    logging.warning(f"Failed to fetch external IP: {e}. Falling back to Docker bridge IP.")
                    # Fallback to default Docker bridge IP if fetching fails
                    host_for_registration = "172.17.0.1"
                    logging.warning(f"Defaulting registration host to Docker bridge IP: {host_for_registration}. Ensure clients can reach this IP.")
                except Exception as e:
                    logging.error(f"Unexpected error fetching external IP: {e}. Falling back to Docker bridge IP.")
                    host_for_registration = "172.17.0.1"
                    logging.warning(f"Defaulting registration host to Docker bridge IP: {host_for_registration}. Ensure clients can reach this IP.")
            else:
                logging.info(f"Redis enabled. Using HOST_EXTERNAL_IP environment variable for registration: {host_for_registration}")
        else:
            logging.info(f"Redis enabled. Using provided host parameter for registration: {host_for_registration}")
    else:  # Redis disabled
        # If Redis is disabled, the registration host defaults to 0.0.0.0 if not provided
        if not host_for_registration:
            host_for_registration = "0.0.0.0"
            logging.warning(f"Redis disabled and no host param provided. Defaulting registration host to {host_for_registration}.")
        else:
            logging.info(f"Redis disabled. Using provided host parameter for registration: {host_for_registration}")

    # host_for_sensor is determined *after* container creation, from the container's IP on the Docker network.

logging.info(f"Preparing deployment for account {account_id}. Registration Host: {host_for_registration}, Port: {port}, Health Port: {health_port}")
|
|
||||||
|
|
||||||
# Generate unique work ID and context directory
|
|
||||||
work_id = str(uuid.uuid4())
|
|
||||||
context['task_instance'].xcom_push(key='work_id', value=work_id)
|
|
||||||
|
|
||||||
context_dir = os.path.join(os.getenv('AIRFLOW_HOME', '/tmp'), 'service-data', work_id, 'context-data')
|
|
||||||
os.makedirs(context_dir, exist_ok=True, mode=0o777)
|
|
||||||
os.chmod(context_dir, 0o777)
|
|
||||||
|
|
||||||
# Push context directory and account details to XCom
|
|
||||||
context['task_instance'].xcom_push(key='context_dir', value=context_dir)
|
|
||||||
context['task_instance'].xcom_push(key='account_id', value=account_id)
|
|
||||||
|
|
||||||
# Deploy the Docker service
|
|
||||||
# The 'host_for_registration' variable here represents the externally accessible IP for registration/XCom.
|
|
||||||
# The service inside the container will listen on 0.0.0.0.
|
|
||||||
logging.info(f"Deploying service for account {account_id}. Registration Host: {host_for_registration}, Port: {port}")
|
|
||||||
|
|
||||||
# Get Redis connection details ONLY if redis_enabled (for the container to register itself)
|
|
||||||
redis_host_for_container = ''
|
|
||||||
redis_port_for_container = ''
|
|
||||||
if redis_enabled:
|
|
||||||
try:
|
|
||||||
# Get connection details to pass to the container environment
|
|
||||||
redis_conn_details = get_redis_connection().connection_pool.connection_kwargs
|
|
||||||
redis_host_for_container = os.getenv('REDIS_HOST', redis_conn_details.get('host', 'redis'))
|
|
||||||
redis_port_for_container = str(os.getenv('REDIS_PORT', redis_conn_details.get('port', 6379)))
|
|
||||||
logging.info(f"Redis enabled. Passing REDIS_HOST={redis_host_for_container}, REDIS_PORT={redis_port_for_container} to container.")
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"Failed to get Redis connection details for container environment: {e}")
|
|
||||||
logging.warning("Proceeding without Redis details in container environment due to error.")
|
|
||||||
# Depending on container requirements, you might want to raise an error here instead
|
|
||||||
else:
|
|
||||||
logging.info("Redis disabled. Not passing REDIS_HOST/REDIS_PORT to container environment.")
|
|
||||||
|
|
||||||
|
|
||||||
# Get Docker connection details from Airflow
|
|
||||||
try:
|
|
||||||
secrets_backend = conf.get('secrets', 'backend', fallback='None')
|
|
||||||
logging.info(f"Attempting to get 'docker_hub' connection. Configured secrets backend: {secrets_backend}")
|
|
||||||
docker_conn = BaseHook.get_connection("docker_hub")
|
|
||||||
docker_username = docker_conn.login
|
|
||||||
docker_password = docker_conn.password
|
|
||||||
logging.info("Successfully retrieved 'docker_hub' connection.")
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"Failed to retrieve 'docker_hub' connection: {e}")
|
|
||||||
# Log details about potential secrets backend issues
|
|
||||||
secrets_backend_kwargs = conf.get('secrets', 'backend_kwargs', fallback='{}')
|
|
||||||
logging.error(f"Secrets backend details: backend={secrets_backend}, kwargs={secrets_backend_kwargs}")
|
|
||||||
# Re-raise the exception to fail the task
|
|
||||||
raise
|
|
||||||
|
|
||||||
    try:
        # Initialize Docker client to connect to docker-socket-proxy
        client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')

        # Authenticate with Docker Hub
        client.login(
            username=docker_username,
            password=docker_password,
            registry=docker_conn.host  # Typically "https://index.docker.io/v1/"
        )

        # Generate a unique container name
        container_name = f"ytdlp_service_{account_id}_{uuid.uuid4().hex[:8]}"

        # Pull the Docker image (if not already present)
        client.images.pull('pangramia/ytdlp-ops-server:latest')

        # Use the configured network name (from params or default)
        network_name = docker_network  # Use the retrieved parameter
        logging.info(f"Attempting to run container on network: {network_name}")

        # Determine if the --probe flag should be added based on the DAG param
        exit_on_proxy_fail = context['dag_run'].conf.get('exit_on_proxy_fail', True)  # Default to True if not set
        command_args = [
            '--script-dir', '/app/scripts',
            '--context-dir', '/app/context-data',  # Use the bind mount target inside container
            '--port', str(port),
            '--health-port', str(health_port),
            '--clients', clients,
            '--timeout', '120',
            '--proxy', proxy if proxy else ''
        ]
        if exit_on_proxy_fail:
            command_args.append('--probe')
            logging.info("Adding --probe flag to container command as exit_on_proxy_fail=True")
        else:
            logging.info("Not adding --probe flag to container command as exit_on_proxy_fail=False")

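        # With the defaults above, command_args resolves to roughly the following container
        # command line (illustrative; ports are assigned at runtime):
        #   --script-dir /app/scripts --context-dir /app/context-data
        #   --port <port> --health-port <port+1> --clients ios,android,mweb
        #   --timeout 120 --proxy socks5://sslocal-rust-1084:1084 --probe
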
        # Run the Docker container with health port
        container = client.containers.run(
            image='pangramia/ytdlp-ops-server:latest',
            command=command_args,  # Use the constructed command list
            environment={
                'PYTHONUNBUFFERED': '1',  # Ensure logs are not buffered
                'SERVER_PORT': str(port),  # Port the service listens on *inside* the container
                'SERVER_HOST': '0.0.0.0',  # Service should listen on all interfaces *inside* the container
                'ACCOUNT_ID': account_id,
                # Pass Redis details *if enabled* for the service to register itself
                'REDIS_HOST': redis_host_for_container,
                'REDIS_PORT': redis_port_for_container,
                # Pass PROXY_URL for health check access
                'PROXY_URL': proxy if proxy else '',
            },
            ports={
                f"{port}/tcp": port,
                f"{health_port}/tcp": health_port
            },
            volumes={
                context_dir: {'bind': '/app/context-data', 'mode': 'rw'}
            },
            network_mode=network_name,  # Use the specified network variable
            auto_remove=False,  # Do not auto-remove the container
            name=container_name,  # Use a unique name
            detach=True,
            tty=True,
            shm_size='256m',
            # Updated healthcheck to test external connectivity via proxy
            healthcheck={
                # Use CMD-SHELL to allow conditional logic based on PROXY_URL env var
                'test': [
                    'CMD-SHELL',
                    # Script checks if PROXY_URL is set, uses it with curl if yes, otherwise curls directly.
                    # -f: Fail silently (exit non-zero on error)
                    # --connect-timeout 10: Timeout for connection phase
                    # > /dev/null: Discard output, we only care about exit code
                    'if [ -n "$PROXY_URL" ]; then '
                    'curl -f --connect-timeout 10 -x "$PROXY_URL" https://ifconfig.co > /dev/null; '
                    'else '
                    'curl -f --connect-timeout 10 https://ifconfig.co > /dev/null; '
                    'fi'
                ],
                'interval': 30 * 1000000000,  # Check every 30 seconds (30 * 1e9 nanoseconds)
                'timeout': 15 * 1000000000,  # Timeout after 15 seconds (15 * 1e9 nanoseconds)
                'retries': 5,  # Retry 5 times on failure
                'start_period': 15 * 1000000000  # Grace period of 15 seconds after start
            },
            # Add labels for better identification
            labels={
                'service': 'ytdlp',
                'account_id': account_id
            }
        )

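        # The call above is roughly equivalent to the following CLI invocation (sketch only;
        # names, ports, and paths are filled in at runtime):
        #   docker run -d --name <container_name> --network <docker_network> \
        #     -e SERVER_HOST=0.0.0.0 -e SERVER_PORT=<port> -e ACCOUNT_ID=<account_id> \
        #     -e PROXY_URL=<proxy> -p <port>:<port> -p <health_port>:<health_port> \
        #     -v <context_dir>:/app/context-data --shm-size 256m \
        #     pangramia/ytdlp-ops-server:latest <command_args...>
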
        # Wait for container to be running (skip health check verification)
        start_time = time.time()
        while True:
            container.reload()
            if container.status == 'running':
                break
            if time.time() - start_time > 10:  # 10 second timeout
                raise TimeoutError("Container failed to start within 10 seconds")
            time.sleep(1)

        logging.info(f"Container started: {container.id} (health check verification skipped)")
        # Push container details immediately after creation using simplified keys
        context['task_instance'].xcom_push(key='container_id', value=container.id)
        context['task_instance'].xcom_push(key='container_name', value=container_name)
        logging.info(f"Pushed container_id={container.id} and container_name={container_name} to XCom.")

        # --- Determine Host for Sensor ---
        # Get the container's IP address on the specified network for the HttpSensor
        try:
            container.reload()  # Refresh container attributes
            network_settings = container.attrs.get('NetworkSettings', {}).get('Networks', {})
            if network_name in network_settings:
                host_for_sensor = network_settings[network_name].get('IPAddress')
                if not host_for_sensor:
                    raise ValueError(f"Container {container.id} has no IPAddress on network '{network_name}'")
                logging.info(f"Using container IP '{host_for_sensor}' on network '{network_name}' for HttpSensor.")
            else:
                # Fallback or error if container not on expected network
                logging.error(f"Container {container.id} is not attached to the expected network '{network_name}'. Network settings: {network_settings}")
                # Option 1: Fallback to container name (might fail as observed)
                # host_for_sensor = container_name
                # logging.warning(f"Falling back to container name '{host_for_sensor}' for sensor.")
                # Option 2: Raise error
                raise ValueError(f"Container {container.id} not found on network '{network_name}'. Cannot determine IP for sensor.")

        except Exception as e:
            logging.error(f"Failed to get container IP address: {e}", exc_info=True)
            raise AirflowException(f"Failed to determine IP address for HttpSensor: {e}")

        # Ensure we don't use 0.0.0.0 or an empty string for the sensor
        if not host_for_sensor or host_for_sensor == "0.0.0.0":
            raise ValueError(f"Determined host_for_sensor is invalid ('{host_for_sensor}'). Check container network attachment and IP assignment.")

        # --- Add extra logging before pushing ---
        logging.info("FINAL CHECK before XCom push:")
        logging.info(f"  Account ID: {account_id}")
        logging.info(f"  Host for Sensor (IP Address): {host_for_sensor}")
        logging.info(f"  Host for Registration: {host_for_registration}")
        logging.info(f"  Service Port: {port}")
        logging.info(f"  Health Port: {health_port}")
        logging.info(f"  Pushing to XCom key: service_host with value: {host_for_sensor}")
        # --- End extra logging ---

        # Push distinct service connection details using simplified keys
        context['task_instance'].xcom_push(key='service_host_registration', value=host_for_registration)  # For client discovery (e.g., Redis)
        context['task_instance'].xcom_push(key='service_host', value=host_for_sensor)  # IP address for HttpSensor
        context['task_instance'].xcom_push(key='service_port', value=port)  # Port is the same
        context['task_instance'].xcom_push(key='service_health_port', value=health_port)  # Health port is the same
        logging.info(f"Pushed host_for_sensor (IP Address)={host_for_sensor} to XCom key 'service_host'")
        logging.info(f"Pushed host_for_registration={host_for_registration} to XCom key 'service_host_registration'")

        # Store account metadata in Redis only if redis_enabled is True.
        # This uses 'host_for_registration' for client discovery.
        if redis_enabled:
            store_account_metadata(account_id, host_for_registration, port, proxy, health_port, container.id)

        # If we reach here, deployment is considered successful for now
        logging.info("Deployment preparation successful.")
        # Return values are implicitly pushed to XCom (but we pushed explicitly above)
        return context_dir, host_for_registration, port

    except Exception as e:
        logging.error(f"Error during service deployment: {e}", exc_info=True)
        # Attempt to cleanup the container if it was created before the error
        try:
            if 'container' in locals() and container and container.id:
                logging.warning(f"Attempting to stop and remove container {container.id} due to deployment error.")
                container.stop(timeout=5)
                container.remove(force=True)
                logging.info(f"Successfully stopped and removed container {container.id} after error.")
            elif 'container_name' in locals() and container_name:
                # Try finding by name if ID wasn't captured
                containers = client.containers.list(filters={'name': container_name})
                if containers:
                    logging.warning(f"Attempting to stop and remove container {containers[0].name} by name due to deployment error.")
                    containers[0].stop(timeout=5)
                    containers[0].remove(force=True)
                    logging.info(f"Successfully stopped and removed container {containers[0].name} after error.")
        except Exception as cleanup_err:
            logging.error(f"Failed during post-error container cleanup: {cleanup_err}")
        raise  # Re-raise the original exception to fail the task


# Removed the old monitor_health PythonOperator

# stop_service and cleanup_service are now defined directly in the DAG below.

def check_service_health(ti=None, **context):
    """
    Periodically checks the service's /health endpoint using requests.
    Acts as a long-running sentinel task. Fails if the health check fails
    repeatedly or times out.
    """
    # Get parameters from XCom
    host_reg = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_host_registration')
    host_svc = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_host')
    health_port = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_health_port')

    # Determine the host to use (prioritize the registration host)
    host = host_reg if host_reg and host_reg != '0.0.0.0' else host_svc
    if not host or not health_port:
        raise AirflowException("Could not retrieve host or health_port from XCom for health check.")

    health_url = f"http://{host}:{health_port}/health"
    logging.info(f"Starting health check for: {health_url}")

    # Get configuration for polling.
    # Use the task's execution_timeout if available, otherwise default to 1 year.
    task_timeout = ti.task.execution_timeout or timedelta(days=365)
    poke_interval = 60  # Check every 60 seconds (adjust as needed)
    start_time = time.monotonic()
    timeout_seconds = task_timeout.total_seconds()
    consecutive_error_start_time = None  # Track start time of consecutive connection errors
    error_retry_window = 10  # Seconds to retry connection errors before failing

    while True:
        current_time = time.monotonic()
        if current_time - start_time > timeout_seconds:
            raise AirflowException(f"Health check timed out after {timeout_seconds} seconds for {health_url}")

        try:
            # Use a reasonable timeout for the individual request
            response = requests.get(health_url, timeout=15)  # 15 second request timeout
            response.raise_for_status()  # Raises HTTPError for bad responses (4xx or 5xx)

            # Check response content if needed (optional).
            # Example: check for specific JSON content:
            # try:
            #     data = response.json()
            #     if data.get("status") == "healthy":
            #         logging.info(f"Health check successful: Status {response.status_code}")
            #     else:
            #         logging.warning(f"Health check OK (Status {response.status_code}), but content unexpected: {data}")
            # except requests.exceptions.JSONDecodeError:
            #     logging.warning(f"Health check OK (Status {response.status_code}), but response is not valid JSON.")

            # If we got a 2xx status, log success and reset the error timer if needed
            if consecutive_error_start_time is not None:
                logging.info(f"Connection to {health_url} recovered.")
                consecutive_error_start_time = None
            logging.info(f"Health check successful: Status {response.status_code} for {health_url}")

        except requests.exceptions.Timeout:
            current_monotonic_time = time.monotonic()
            if consecutive_error_start_time is None:
                consecutive_error_start_time = current_monotonic_time
                logging.warning(f"Health check request timed out for {health_url}. Starting {error_retry_window}s retry window...")
            else:
                elapsed_error_time = current_monotonic_time - consecutive_error_start_time
                if elapsed_error_time > error_retry_window:
                    error_msg = f"Health check failed for {health_url}: Timeout persisted for over {error_retry_window} seconds."
                    logging.error(error_msg)
                    raise AirflowException(error_msg)
                else:
                    logging.warning(f"Health check request timed out for {health_url}. Retrying within {error_retry_window}s window ({elapsed_error_time:.1f}s elapsed)...")

        except requests.exceptions.ConnectionError as e:
            # If the error is specifically "Connection refused", fail immediately
            if "[Errno 111] Connection refused" in str(e):
                logging.error(f"Health check failed for {health_url}: Connection refused. Failing task immediately.")
                raise AirflowException(f"Health check failed for {health_url}: Connection refused")
            else:
                # Handle other connection errors with the retry window
                current_monotonic_time = time.monotonic()
                if consecutive_error_start_time is None:
                    consecutive_error_start_time = current_monotonic_time
                    logging.warning(f"Health check connection error for {health_url}: {e}. Starting {error_retry_window}s retry window...")
                else:
                    elapsed_error_time = current_monotonic_time - consecutive_error_start_time
                    if elapsed_error_time > error_retry_window:
                        error_msg = f"Health check failed for {health_url}: Connection error persisted for over {error_retry_window} seconds. Last error: {e}"
                        logging.error(error_msg)
                        raise AirflowException(error_msg)
                    else:
                        logging.warning(f"Health check connection error for {health_url}: {e}. Retrying within {error_retry_window}s window ({elapsed_error_time:.1f}s elapsed)...")

        except requests.exceptions.HTTPError as e:
            # This catches 4xx/5xx errors - fail immediately
            logging.error(f"Health check failed for {health_url}: Status {e.response.status_code}. Failing task.")
            raise AirflowException(f"Health check failed for {health_url}: Status {e.response.status_code}")
        except requests.exceptions.RequestException as e:
            logging.error(f"Health check failed for {health_url} with unexpected error: {e}. Failing task.")
            # Fail the task immediately on other request errors
            raise AirflowException(f"Health check failed for {health_url}: {e}")
        except Exception as e:
            # Catch any other unexpected errors during the check
            logging.error(f"Unexpected error during health check for {health_url}: {e}", exc_info=True)
            raise AirflowException(f"Unexpected error during health check: {e}")

        # Wait for the poke interval before the next check
        time.sleep(poke_interval)

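# Quick manual probe of the same endpoint checked above (illustrative; substitute the container
# IP and health port pushed to XCom by prepare_and_deploy):
#
#   import requests
#   resp = requests.get("http://<service_host>:<health_port>/health", timeout=15)
#   resp.raise_for_status()
#   print(resp.status_code, resp.text)
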
def _wait_forever():
    """Sleeps indefinitely (or until task timeout) to simulate a running service."""
    logging.info("Sentinel task started. Sleeping in a loop...")
    # Sleep in a loop with a reasonable interval to avoid OverflowError.
    # The task will keep running until it times out based on execution_timeout
    # or is manually stopped/failed.
    while True:
        try:
            # Sleep for a long interval (e.g., 1 day); adjust if needed.
            time.sleep(86400)  # Sleep for 24 hours
        except KeyboardInterrupt:
            logging.info("Sentinel task interrupted. Exiting.")
            break
        except Exception as e:
            # Log other potential errors during sleep, though unlikely
            logging.error(f"Error during sentinel sleep loop: {e}")
            # Optionally break or continue based on error handling strategy
            break  # Exit loop on unexpected error


def stop_service(**context):
    """Stop the running Docker container with verification."""
    # Retrieve account_id from params or kwargs
    account_id = context.get('params', {}).get('account_id') or context.get('account_id')
    if not account_id:
        raise ValueError("Account ID is missing.")

    # Initialize Docker client to connect to docker-socket-proxy
    client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')

    try:
        # For testing, try to get the container ID from the environment if XCom is not available
        container_id = None
        if 'ti' in context:
            # Use simplified XCom key
            container_id = context['ti'].xcom_pull(task_ids='prepare_and_deploy', key='container_id')

        if not container_id:
            # If not found in XCom, try to find the container by account_id pattern (keep this fallback)
            containers = client.containers.list(filters={"name": f"ytdlp_service_{account_id}"})
            if containers:
                container = containers[0]
                container_id = container.id
                logging.info(f"Found container by name pattern: {container.name} (ID: {container_id})")
            else:
                logging.warning(f"No container found for account {account_id} - nothing to stop")
                return

        if container_id:
            # If found in XCom, stop by container ID
            container = client.containers.get(container_id)

            # Verify the container is running before stopping
            if container.status != 'running':
                logging.warning(f"Container {container_id} is not running (status: {container.status})")
                return

            logging.info(f"Stopping container {container_id}...")
            container.stop(timeout=10)  # 10 second timeout

            # Verify the container is stopped
            container.reload()
            if container.status == 'exited':
                logging.info(f"Successfully stopped container {container_id}")
            else:
                logging.error(f"Container {container_id} failed to stop (status: {container.status})")
                raise RuntimeError(f"Container {container_id} failed to stop")

            # Clear Redis entries only if redis_enabled is True.
            # Retrieve redis_enabled status from DAG run conf or params.
            redis_enabled = context['dag_run'].conf.get('redis_enabled', False) or context['params'].get('redis_enabled', False)
            if redis_enabled:
                redis_client = get_redis_connection()
                try:
                    # Verify Redis connection
                    if not redis_client.ping():
                        raise ConnectionError("Failed to connect to Redis")

                    # Remove main metadata
                    redis_client.delete(f"ytdlp:{account_id}")
                    # Remove from accounts set
                    redis_client.srem("ytdlp:accounts", account_id)
                    logging.info(f"Successfully cleared Redis entries for account: {account_id}")
                except Exception as e:
                    logging.error(f"Failed to clear Redis entries for account {account_id}: {e}")
                    # Do not raise here, allow container stop to be considered successful
                    # raise  # Optional: re-raise if Redis cleanup failure should fail the task

            return

        logging.warning(f"No container found for account {account_id} - nothing to stop")

    except docker.errors.NotFound as e:
        logging.warning(f"Container for account {account_id} not found: {e}")
    except Exception as e:
        logging.error(f"Failed to stop container: {e}")
        raise


def cleanup_service(**context):
    """Cleanup service resources including Redis entries and XCom data."""
    # Note: This function is now called within the manual_stop_cleanup TaskGroup
    try:
        # Retrieve account_id from params first, then from XCom
        account_id = context['params'].get('account_id')
        if not account_id:
            # Try to get it from XCom
            account_id = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='account_id')
            if not account_id:
                logging.warning("Account ID not found in params or XCom - skipping resource cleanup")
                return

        # Redis cleanup (if redis_enabled=True) is handled in the 'stop_service' task.
        logging.info(f"Redis cleanup for account {account_id} is handled by the 'stop_service' task if enabled.")

        # Cleanup XCom data (using simplified keys where applicable).
        # Note: XCom cleanup is generally not strictly necessary but can be good practice.
        # Airflow manages XCom expiry, and this code doesn't actually *delete* XComs;
        # to truly delete them you'd use the Airflow API or DB directly.
        # The pull calls are left here as they don't harm anything.
        ti = context['task_instance']
        ti.xcom_pull(key='container_id', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='container_name', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='service_host_registration', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='service_host', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='service_port', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='service_health_port', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='work_id', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='context_dir', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='account_id', task_ids='prepare_and_deploy', include_prior_dates=True)  # Keep account_id pull
        logging.info(f"Pulled XCom data for potential cleanup logging for account: {account_id}")

        # Initialize Docker client
        client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')
        container_found_and_removed = False

        # Attempt 1: Get container ID from XCom using simplified key
        container_id_xcom = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='container_id')
        if container_id_xcom:
            logging.info(f"Attempting to remove container using XCom ID: {container_id_xcom}")
            try:
                container = client.containers.get(container_id_xcom)
                logging.info(f"Found container {container.id} (Name: {container.name}). Removing...")
                container.remove(force=True)
                logging.info(f"Successfully removed container {container.id}")
                container_found_and_removed = True
            except docker.errors.NotFound:
                logging.warning(f"Container with XCom ID {container_id_xcom} not found. Trying other methods.")
            except Exception as e:
                logging.error(f"Error removing container {container_id_xcom}: {e}")

        # Attempt 2: Find container by labels if not found/removed via XCom ID
        if not container_found_and_removed:
            logging.info(f"Attempting to find and remove container by labels: service=ytdlp, account_id={account_id}")
            try:
                containers = client.containers.list(
                    filters={'label': ['service=ytdlp', f'account_id={account_id}']},
                    all=True  # Include stopped containers
                )
                if containers:
                    for container in containers:
                        logging.info(f"Found container {container.id} (Name: {container.name}) by labels. Removing...")
                        try:
                            container.remove(force=True)
                            logging.info(f"Successfully removed container {container.id}")
                            container_found_and_removed = True  # Mark as found even if only one is removed
                        except Exception as e:
                            logging.error(f"Error removing container {container.id} found by labels: {e}")
                else:
                    logging.info("No containers found matching labels.")
            except Exception as e:
                logging.error(f"Error searching for containers by labels: {e}")

        # Attempt 3: Find container by name pattern if still not found/removed
        if not container_found_and_removed:
            container_name_pattern = f"ytdlp_service_{account_id}_*"
            logging.info(f"Attempting to find and remove container by name pattern: {container_name_pattern}")
            try:
                containers = client.containers.list(filters={'name': container_name_pattern}, all=True)
                if containers:
                    for container in containers:
                        logging.info(f"Found container {container.id} (Name: {container.name}) by name pattern. Removing...")
                        try:
                            container.remove(force=True)
                            logging.info(f"Successfully removed container {container.id}")
                            container_found_and_removed = True
                        except Exception as e:
                            logging.error(f"Error removing container {container.id} found by name: {e}")
                else:
                    logging.info("No containers found matching name pattern.")
            except Exception as e:
                logging.error(f"Error searching for containers by name: {e}")

        if not container_found_and_removed:
            logging.warning(f"Could not find or remove any container for account {account_id} using ID, labels, or name.")

        # Get the context directory from XCom and remove it
        context_dir = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='context_dir')
        if context_dir and os.path.exists(context_dir):
            shutil.rmtree(context_dir)
            logging.info(f"Cleaned up working directory: {context_dir}")
    except Exception as e:
        logging.error(f"Error during cleanup: {e}")
        raise


# Define the DAG
with DAG(
    'ytdlp_service',
    default_args=default_args,
    description='Deploy YTDLP token service for ios, android, mweb',
    schedule_interval=None,
    start_date=days_ago(1),  # Use dynamic start date for manually triggered DAG
    catchup=False,
    tags=['youtube', 'tokens', 'service', 'docker'],
    # executor_config moved to default_args
    is_paused_upon_creation=False,
    params={
        'account_id': Param(
            'account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09',
            type="string",
            description="Required: The account ID for which the service is being deployed."
        ),
        'proxy': Param(
            'socks5://sslocal-rust-1084:1084',
            type=["null", "string"],
            description="Optional: The SOCKS5 proxy URL to use for the service (e.g., socks5://host:port)."
        ),
        'clients': Param(
            'ios,android,mweb',
            type="string",
            description="Comma-separated list of client types (e.g., ios,android,mweb)."
        ),
        'redis_enabled': Param(
            False,
            type="boolean",
            description="Use Redis for service discovery? If False, host/port must be provided or will be auto-assigned."
        ),
        'host': Param(
            None,
            type=["null", "string"],
            description="Optional: Host IP for the service. If redis_enabled=False and host is not provided, defaults to '0.0.0.0'. If redis_enabled=True and host is not provided, uses HOST_EXTERNAL_IP, then tries to auto-detect the external IP, falling back to the Docker bridge IP (172.17.0.1)."
        ),
        'port': Param(
            None,
            type=["null", "integer"],
            description="Optional: Port for the service. If None, a free port will be assigned automatically. If a port is provided, it will be used after checking availability."
        ),
        # redis_host and redis_port parameters are removed.
        # If redis_enabled=True, the DAG will use the 'redis_default' Airflow connection.
        'docker_network': Param(
            'airflow_prod_proxynet',
            type="string",
            description="Optional: The Docker network to attach the container to. Defaults to 'airflow_prod_proxynet'."
        ),
        'exit_on_proxy_fail': Param(
            True,
            type="boolean",
            description="Exit the service container immediately if the initial proxy test fails?"
        ),
    }
) as dag:

    # Task to prepare and deploy the service
    prepare_and_deploy = PythonOperator(
        task_id='prepare_and_deploy',
        python_callable=prepare_and_deploy_service,
        provide_context=True,
        trigger_rule='all_success'  # Keep default trigger rule for prepare_and_deploy
    )

    # Combined health check and sentinel task using PythonOperator.
    # This task runs for a long time, checking health periodically using the 'requests' library.
    # If the health check fails repeatedly or times out, the task fails, triggering 'stop_service'.
    monitor_service_health = PythonOperator(
        task_id='monitor_service_health',
        python_callable=check_service_health,
        provide_context=True,
        # Set execution timeout for the task itself (acts as the overall timeout)
        execution_timeout=timedelta(days=365),  # Long timeout (e.g., 1 year)
        # op_kwargs can pass static config, but host/port come from XCom inside the function;
        # poke_interval and the per-request timeout are handled within check_service_health.
    )
    monitor_service_health.doc_md = """
    ### Monitor Service Health Task (PythonOperator)
    Uses a Python function to periodically check the service's `/health` endpoint using the `requests` library.
    Acts as both a health check and a sentinel for the running service.
    - **Pulls from XCom:** Reads `service_host_registration`, `service_host`, and `service_health_port` from the `prepare_and_deploy` task to construct the target URL.
    - **Polling:** Checks the `/health` endpoint every 60 seconds.
    - **Timeout:** Uses the task's `execution_timeout` (set to 1 year) as the overall maximum duration. Individual requests have a 15-second timeout.
    - **Failure:** If a health check request returns a 4xx/5xx status code or encounters other request errors, the task fails immediately. If the overall `execution_timeout` is reached without a failure, the task eventually times out and fails.
    """

    # Task to stop the service (runs if monitor_service_health fails)
    stop = PythonOperator(
        task_id='stop_service',
        python_callable=stop_service,
        provide_context=True,
        trigger_rule=TriggerRule.ONE_FAILED  # Run only if monitor_service_health fails
    )
    stop.doc_md = """
    ### Stop Service Task
    Stops the Docker container associated with the service.
    - **Trigger Rule:** `one_failed` - This task only runs if the upstream `monitor_service_health` task fails.
    - Pulls the container ID/name from XCom or finds it using labels/name patterns.
    - Clears Redis entries if `redis_enabled=True`.
    """

    # Marker task to indicate that the deployment failed
    prepare_failed_marker = EmptyOperator(
        task_id='prepare_failed_marker',
        trigger_rule=TriggerRule.ONE_FAILED  # Run only if 'prepare_and_deploy' fails
    )

    # Task to clean up resources (runs after the stop sequence OR if prepare fails)
    cleanup = PythonOperator(
        task_id='cleanup_service',
        python_callable=cleanup_service,
        provide_context=True,
        trigger_rule=TriggerRule.ALL_DONE  # Run after upstream (stop or prepare_failed_marker) is done
    )
    cleanup.doc_md = """
    ### Cleanup Service Task
    Removes the Docker container and cleans up related resources.
    - **Trigger Rule:** `all_done` - Runs after the `stop_service` task finishes, whether it succeeded or failed.
    - Removes the container using the ID from XCom, labels, or name patterns.
    - Cleans up XCom variables.
    - Removes the context directory.
    """

    # Define task dependencies
    # Success path:          prepare -> monitor (runs indefinitely)
    # Monitor failure path:  monitor (fails) -> stop -> cleanup
    # Prepare failure path:  prepare (fails) -> prepare_failed_marker -> cleanup

    prepare_and_deploy >> monitor_service_health
    prepare_and_deploy >> prepare_failed_marker  # Trigger marker if prepare fails

    monitor_service_health >> stop  # Trigger stop if monitor fails

    # Cleanup is triggered after stop finishes OR after prepare_failed_marker finishes
    stop >> cleanup
    prepare_failed_marker >> cleanup

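# This DAG has no schedule and is meant to be triggered manually with a conf payload,
# for example (illustrative values only):
#
#   airflow dags trigger ytdlp_service --conf '{
#       "account_id": "account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09",
#       "proxy": "socks5://sslocal-rust-1084:1084",
#       "clients": "ios,android,mweb",
#       "redis_enabled": true,
#       "docker_network": "airflow_prod_proxynet",
#       "exit_on_proxy_fail": true
#   }'
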
@@ -1,21 +1,21 @@
-version: '3.8'
-
 services:
   camoufox:
     build:
       context: ./camoufox # Path relative to the docker-compose file
       dockerfile: Dockerfile
+      args:
+        VNC_PASSWORD: ${VNC_PASSWORD:-supersecret} # Use environment variable or default
     ports:
       # Optionally expose the camoufox port to the host for debugging
       - "12345:12345"
-      - "5900:5900" # Expose VNC port to the host, still not working
+      - "5900:5900" # Expose VNC port to the host
     networks:
       - airflow_prod_proxynet
     command: [
       "--ws-host", "0.0.0.0",
       "--port", "12345",
       "--ws-path", "mypath",
-      "--proxy-url", "socks5://sslocal-rust-1082:1082",
+      "--proxy-url", "socks5://sslocal-rust-1084:1084",
       "--locale", "en-US",
       "--geoip",
       "--extensions", "/app/extensions/google_sign_in_popup_blocker-1.0.2.xpi,/app/extensions/spoof_timezone-0.3.4.xpi,/app/extensions/youtube_ad_auto_skipper-0.6.0.xpi"
@@ -35,8 +35,6 @@ services:
     networks:
       - airflow_prod_proxynet
     command:
-      - "--script-dir"
-      - "/app/scripts"
       - "--context-dir"
       - "/app/context-data"
       - "--port"
@@ -44,8 +42,8 @@
       - "--clients"
       # Add 'web' client since we now have camoufox, test firstly
      - "web,ios,android,mweb"
-      - "--proxy"
-      - "socks5://sslocal-rust-1082:1082"
+      - "--proxies"
+      - "socks5://sslocal-rust-1081:1081,socks5://sslocal-rust-1082:1082,socks5://sslocal-rust-1083:1083,socks5://sslocal-rust-1084:1084,socks5://sslocal-rust-1085:1085"
       # Add the endpoint argument pointing to the camoufox service
       - "--endpoint"
       - "ws://camoufox:12345/mypath"
@@ -54,6 +52,15 @@
       - "--camouflage-only"
       # Add flag to print full tokens in logs by default
       - "--print-tokens"
+      # Add server identity and Redis connection details
+      - "--server-identity"
+      - "ytdlp-ops-airflow-service"
+      - "--redis-host"
+      - "${REDIS_HOST:-redis}"
+      - "--redis-port"
+      - "${REDIS_PORT:-6379}"
+      - "--redis-password"
+      - "${REDIS_PASSWORD}"
     restart: unless-stopped
     pull_policy: always
 
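The new build arg and command entries above are resolved from the compose environment; a minimal
sketch of the variables they expect (names taken from the compose file, values illustrative):

# .env (illustrative sketch)
VNC_PASSWORD=supersecret
REDIS_HOST=redis
REDIS_PORT=6379
REDIS_PASSWORD=changeme
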
@@ -1,8 +1,11 @@
-thrift>=0.16.0,<=0.20.0
-python-dotenv==1.0.1
-psutil
 flask
+psutil
+PySocks>=1.7.0
+python-dotenv==1.0.1
+redis>=4.0.0
+requests>=2.31.0
+tabulate>=0.9.0
+thrift>=0.16.0,<=0.20.0
 waitress
 yt_dlp>=2025.3.27
 yt-dlp-get-pot==0.3.0
-requests>=2.31.0