Local changes covering multiple fixes: skip URLs with bad formats, allow sending downloads to the aria2c service, adopt ban CLI policy testing, and pass through language and headers from Airflow DAGs when needed

This commit is contained in:
aperez 2025-12-01 20:27:50 +03:00
parent 302282365e
commit 336438d4cc
40 changed files with 2848 additions and 1293 deletions

View File

@ -0,0 +1,126 @@
# _ _ ____ ____
# / \ _ __(_) __ _|___ \ | _ \ _ __ ___
# / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
# / ___ \| | | | (_| |/ __/ | __/| | | (_) |
# /_/ \_\_| |_|\__,_|_____| |_| |_| \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.
# Using Debian Bullseye as a more stable base than EOL Alpine
FROM debian:bullseye-slim
# Install s6-overlay and build aria2 in a single layer to reduce image size
# renovate: datasource=github-releases depName=just-containers/s6-overlay
ARG S6_OVERLAY_VERSION=v3.1.6.2
RUN BUILD_DEPS=" \
build-essential \
autoconf \
automake \
autotools-dev \
libtool \
pkg-config \
git \
gettext \
autopoint \
gettext-base \
libssl-dev \
libssh2-1-dev \
libc-ares-dev \
libexpat1-dev \
vim \
libexpat1 \
zlib1g-dev \
libsqlite3-dev \
" && \
apt-get update && \
apt-get install -y --no-install-recommends \
jq \
findutils \
ca-certificates \
curl \
xz-utils \
dos2unix \
$BUILD_DEPS && \
curl -sSL https://github.com/just-containers/s6-overlay/releases/download/${S6_OVERLAY_VERSION}/s6-overlay-noarch.tar.xz -o /tmp/s6-overlay-noarch.tar.xz && \
curl -sSL https://github.com/just-containers/s6-overlay/releases/download/${S6_OVERLAY_VERSION}/s6-overlay-x86_64.tar.xz -o /tmp/s6-overlay-x86_64.tar.xz && \
tar -C / -Jxpf /tmp/s6-overlay-noarch.tar.xz && \
tar -C / -Jxpf /tmp/s6-overlay-x86_64.tar.xz && \
git clone https://github.com/aria2/aria2.git /tmp/aria2 && \
cd /tmp/aria2 && \
git checkout 8985d66e71f980e7d2765753800078f47761f1ba && \
sed -i "s/\"1\", 1, 16, 'x'));/\"1\", 1, 128, 'x'));/" src/OptionHandlerFactory.cc && \
autoreconf -i && \
./configure \
--disable-dependency-tracking \
--enable-static \
--disable-shared \
--with-ca-bundle=/etc/ssl/certs/ca-certificates.crt \
--without-libxml2 \
--with-libexpat \
--without-libgcrypt \
--with-openssl \
--with-libcares \
--with-libsqlite3 \
--with-libssh2 \
--with-zlib && \
make -j$(nproc) && \
make install && \
cd / && \
# Build deps are intentionally not purged here (skipping: apt-get purge -y --auto-remove $BUILD_DEPS) so the toolchain stays available at runtime && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*
COPY rootfs /
RUN find /etc/cont-init.d /etc/services.d -type f -exec dos2unix {} + && \
find /etc/cont-init.d /etc/services.d -type f -exec chmod +x {} +
ENV S6_BEHAVIOUR_IF_STAGE2_FAILS=1 \
RCLONE_CONFIG=/config/rclone.conf \
UPDATE_TRACKERS=true \
CUSTOM_TRACKER_URL= \
LISTEN_PORT=6888 \
RPC_PORT=6800 \
RPC_SECRET= \
PUID= PGID= \
DISK_CACHE= \
IPV6_MODE= \
UMASK_SET= \
SPECIAL_MODE=
EXPOSE \
6800 \
6888 \
6888/udp
VOLUME \
/config \
/downloads
#ENTRYPOINT ["/init"]
CMD ["aria2c", \
"--enable-rpc=true", \
"--rpc-listen-all=true", \
"--rpc-listen-port=6800", \
"--listen-port=6888", \
"--disable-ipv6=true", \
"--max-concurrent-downloads=128", \
"--max-connection-per-server=32", \
"--split=6", \
"--min-split-size=2M", \
"--file-allocation=falloc", \
"--continue=false", \
"--check-integrity=false", \
"--log-level=info", \
"--console-log-level=info", \
"--save-session-interval=5", \
"--dir=/downloads", \
"--disk-cache=64M", \
"--input-file=/config/aria2.session", \
"--save-session=/config/aria2.session"]

View File

@ -0,0 +1,17 @@
----------------------------------------------------------------
█████╗ ██████╗ ██╗ █████╗ ██████╗ ██████╗ ██████╗ ██████╗
██╔══██╗██╔══██╗██║██╔══██╗╚════██╗ ██╔══██╗██╔══██╗██╔═══██╗
███████║██████╔╝██║███████║ █████╔╝ ██████╔╝██████╔╝██║ ██║
██╔══██║██╔══██╗██║██╔══██║██╔═══╝ ██╔═══╝ ██╔══██╗██║ ██║
██║ ██║██║ ██║██║██║ ██║███████╗ ██║ ██║ ██║╚██████╔╝
╚═╝ ╚═╝╚═╝ ╚═╝╚═╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝
https://github.com/P3TERX/Aria2-Pro-Docker
Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
Version: COMMIT_HASH | Build Time: DATE_TIME
----------------------------------------------------------------

View File

@ -0,0 +1,39 @@
#!/usr/bin/with-contenv bash
# _ _ ____ ____
# / \ _ __(_) __ _|___ \ | _ \ _ __ ___
# / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
# / ___ \| | | | (_| |/ __/ | __/| | | (_) |
# /_/ \_\_| |_|\__,_|_____| |_| |_| \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.
. /etc/init-base
mkdir -p ${ARIA2_CONF_DIR} ${SCRIPT_DIR} ${DOWNLOAD_DIR}
PROFILES="
aria2.conf
"
DOWNLOAD_PROFILE
[[ ! -f "${ARIA2_CONF_DIR}/aria2.session" ]] && {
rm -rf "${ARIA2_CONF_DIR}/aria2.session"
touch "${ARIA2_CONF_DIR}/aria2.session"
}
if ! [[ "${UPDATE_TRACKERS}" = "false" || "${UPDATE_TRACKERS}" = "disable" ]]; then
rm -f /etc/services.d/crond/down
PROFILES="tracker.sh"
DOWNLOAD_PROFILE
bash ${SCRIPT_DIR}/tracker.sh ${ARIA2_CONF}
else
touch /etc/services.d/crond/down
fi
exit 0

View File

@ -0,0 +1,35 @@
#!/usr/bin/with-contenv bash
# _ _ ____ ____
# / \ _ __(_) __ _|___ \ | _ \ _ __ ___
# / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
# / ___ \| | | | (_| |/ __/ | __/| | | (_) |
# /_/ \_\_| |_|\__,_|_____| |_| |_| \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.
. /etc/init-base
INSTALL_RCLONE() {
if [[ ! -f /usr/local/bin/rclone ]]; then
echo
echo -e "${INFO} Installing RCLONE ..."
[[ -L /usr/bin/unzip ]] && rm -f /usr/bin/unzip
curl -fsSL https://rclone.org/install.sh | bash
fi
}
if [[ "${SPECIAL_MODE}" = "rclone" ]]; then
INSTALL_RCLONE
PROFILES="upload.sh rclone.env"
DOWNLOAD_PROFILE
elif [[ "${SPECIAL_MODE}" = "move" ]]; then
PROFILES="move.sh"
DOWNLOAD_PROFILE
fi
exit 0

View File

@ -0,0 +1,61 @@
#!/usr/bin/with-contenv bash
# _ _ ____ ____
# / \ _ __(_) __ _|___ \ | _ \ _ __ ___
# / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
# / ___ \| | | | (_| |/ __/ | __/| | | (_) |
# /_/ \_\_| |_|\__,_|_____| |_| |_| \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.
. /etc/init-base
[[ -e ${ARIA2_CONF_DIR}/delete.sh ]] && {
rm -f ${ARIA2_CONF_DIR}/*.sh
sed -i "s@^\(on-download-stop=\).*@\1${SCRIPT_DIR}/delete.sh@" ${ARIA2_CONF}
sed -i "s@^\(on-download-complete=\).*@\1${SCRIPT_DIR}/clean.sh@" ${ARIA2_CONF}
}
sed -i "s@^\(dir=\).*@\1/downloads@" ${ARIA2_CONF}
sed -i "s@^\(input-file=\).*@\1${ARIA2_CONF_DIR}/aria2.session@" ${ARIA2_CONF}
sed -i "s@^\(save-session=\).*@\1${ARIA2_CONF_DIR}/aria2.session@" ${ARIA2_CONF}
sed -i "s@^\(dht-file-path=\).*@\1${ARIA2_CONF_DIR}/dht.dat@" ${ARIA2_CONF}
sed -i "s@^\(dht-file-path6=\).*@\1${ARIA2_CONF_DIR}/dht6.dat@" ${ARIA2_CONF}
[[ -e ${ARIA2_CONF_DIR}/HelloWorld ]] && exit 0
[[ ${RPC_PORT} ]] &&
sed -i "s@^\(rpc-listen-port=\).*@\1${RPC_PORT}@" ${ARIA2_CONF}
[[ ${LISTEN_PORT} ]] && {
sed -i "s@^\(listen-port=\).*@\1${LISTEN_PORT}@" ${ARIA2_CONF}
sed -i "s@^\(dht-listen-port=\).*@\1${LISTEN_PORT}@" ${ARIA2_CONF}
}
[[ ${RPC_SECRET} ]] &&
sed -i "s@^\(rpc-secret=\).*@\1${RPC_SECRET}@" ${ARIA2_CONF}
[[ ${DISK_CACHE} ]] &&
sed -i "s@^\(disk-cache=\).*@\1${DISK_CACHE}@" ${ARIA2_CONF}
[[ "${IPV6_MODE}" = "true" || "${IPV6_MODE}" = "enable" ]] && {
sed -i "s@^\(disable-ipv6=\).*@\1false@" ${ARIA2_CONF}
sed -i "s@^\(enable-dht6=\).*@\1true@" ${ARIA2_CONF}
}
[[ "${IPV6_MODE}" = "false" || "${IPV6_MODE}" = "disable" ]] && {
sed -i "s@^\(disable-ipv6=\).*@\1true@" ${ARIA2_CONF}
sed -i "s@^\(enable-dht6=\).*@\1false@" ${ARIA2_CONF}
}
[[ "${SPECIAL_MODE}" = "rclone" ]] &&
sed -i "s@^\(on-download-complete=\).*@\1${SCRIPT_DIR}/upload.sh@" ${ARIA2_CONF}
[[ "${SPECIAL_MODE}" = "move" ]] &&
sed -i "s@^\(on-download-complete=\).*@\1${SCRIPT_DIR}/move.sh@" ${ARIA2_CONF}
exit 0

View File

@ -0,0 +1,27 @@
#!/usr/bin/with-contenv bash
# _ _ ____ ____
# / \ _ __(_) __ _|___ \ | _ \ _ __ ___
# / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
# / ___ \| | | | (_| |/ __/ | __/| | | (_) |
# /_/ \_\_| |_|\__,_|_____| |_| |_| \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.
. /etc/init-base
if [ -w ${DOWNLOAD_DIR} ]; then echo "Download DIR writeable, not changing owner."; else chown -R p3terx:p3terx ${DOWNLOAD_DIR}; fi
chown -R p3terx:p3terx ${ARIA2_CONF_DIR}
if [[ -z ${PUID} && -z ${PGID} ]] || [[ ${PUID} = 65534 && ${PGID} = 65534 ]]; then
echo -e "${WARN} Ignore permission settings."
chmod -v 777 ${DOWNLOAD_DIR}
chmod -vR 777 ${ARIA2_CONF_DIR}
else
if [ -w ${DOWNLOAD_DIR} ]; then echo "Download DIR writeable, not modifying permission."; else chmod -v u=rwx ${DOWNLOAD_DIR}; fi
chmod -v 600 ${ARIA2_CONF_DIR}/*
chmod -v 755 ${SCRIPT_DIR}
chmod -v 700 ${SCRIPT_DIR}/*
fi

View File

@ -0,0 +1,2 @@
#!/bin/sh
cat /Aria2-Pro

View File

@ -0,0 +1 @@
# BT tracker updates disabled.

View File

@ -0,0 +1,118 @@
# _ _ ____ ____
# / \ _ __(_) __ _|___ \ | _ \ _ __ ___
# / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
# / ___ \| | | | (_| |/ __/ | __/| | | (_) |
# /_/ \_\_| |_|\__,_|_____| |_| |_| \___/
#
# https://github.com/P3TERX/Docker-Aria2-Pro
#
# Copyright (c) 2020 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.
Green_font_prefix="\033[32m"
Yellow_font_prefix="\033[33m"
Red_font_prefix="\033[31m"
Green_background_prefix="\033[42;37m"
Red_background_prefix="\033[41;37m"
Font_color_suffix="\033[0m"
INFO="[${Green_font_prefix}INFO${Font_color_suffix}]"
ERROR="[${Red_font_prefix}ERROR${Font_color_suffix}]"
WARN="[${Yellow_font_prefix}WARN${Font_color_suffix}]"
DOWNLOAD_DIR="/downloads"
ARIA2_CONF_DIR="/config"
ARIA2_CONF="${ARIA2_CONF_DIR}/aria2.conf"
SCRIPT_CONF="${ARIA2_CONF_DIR}/script.conf"
SCRIPT_DIR="${ARIA2_CONF_DIR}/script"
CURL_OPTIONS="-fsSL --connect-timeout 3 --max-time 3"
PROFILE_URL1="https://p3terx.github.io/aria2.conf"
PROFILE_URL2="https://aria2c.now.sh"
PROFILE_URL3="https://cdn.jsdelivr.net/gh/P3TERX/aria2.conf"
FILE_ALLOCATION_SET() {
TMP_FILE="/downloads/P3TERX.COM"
if fallocate -l 5G ${TMP_FILE}; then
FILE_ALLOCATION=falloc
else
FILE_ALLOCATION=none
fi
rm -f ${TMP_FILE}
sed -i "s@^\(file-allocation=\).*@\1${FILE_ALLOCATION}@" "${ARIA2_CONF}"
}
CONVERSION_ARIA2_CONF() {
sed -i "s@^\(rpc-listen-port=\).*@\1${RPC_PORT:-6800}@" "${ARIA2_CONF}"
sed -i "s@^\(listen-port=\).*@\1${LISTEN_PORT:-6888}@" "${ARIA2_CONF}"
sed -i "s@^\(dht-listen-port=\).*@\1${LISTEN_PORT:-6888}@" "${ARIA2_CONF}"
sed -i "s@^\(dir=\).*@\1/downloads@" "${ARIA2_CONF}"
sed -i "s@/root/.aria2@${ARIA2_CONF_DIR}@" "${ARIA2_CONF}"
sed -i "s@^#\(retry-on-.*=\).*@\1true@" "${ARIA2_CONF}"
sed -i "s@^\(max-connection-per-server=\).*@\1128@" "${ARIA2_CONF}"
sed -i "/^on-download-stop=/d" "${ARIA2_CONF}"
sed -i "/^on-download-complete=/d" "${ARIA2_CONF}"
# Custom settings from user
sed -i "s@^\(continue=\).*@\1false@" "${ARIA2_CONF}"
sed -i "s@^\(always-resume=\).*@\1false@" "${ARIA2_CONF}"
sed -i "s@^\(max-concurrent-downloads=\).*@\1500@" "${ARIA2_CONF}"
sed -i "s@^\(enable-dht=\).*@\1false@" "${ARIA2_CONF}"
sed -i "s@^\(enable-dht6=\).*@\1false@" "${ARIA2_CONF}"
sed -i "s@^\(bt-enable-lpd=\).*@\1true@" "${ARIA2_CONF}"
sed -i "s@^\(enable-peer-exchange=\).*@\1false@" "${ARIA2_CONF}"
sed -i "s@^\(max-overall-upload-limit=\).*@\12M@" "${ARIA2_CONF}"
sed -i "s@^\(seed-time=\).*@\11@" "${ARIA2_CONF}"
sed -i "s@^\(user-agent=\).*@\1Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version@" "${ARIA2_CONF}"
sed -i "s@^\(peer-id-prefix=\).*@\1-DE13F0-@" "${ARIA2_CONF}"
sed -i "s@^\(summary-interval=\).*@\11@" "${ARIA2_CONF}"
sed -i "s@^\(show-console-readout=\).*@\1false@" "${ARIA2_CONF}"
sed -i "s@^\(console-log-level=\).*@\1notice@" "${ARIA2_CONF}"
# Add settings not present in default config
echo "" >>"${ARIA2_CONF}"
echo "# Custom settings added" >>"${ARIA2_CONF}"
echo "disable-metalink=true" >>"${ARIA2_CONF}"
echo "follow-torrent=false" >>"${ARIA2_CONF}"
echo "retry-on-400=false" >>"${ARIA2_CONF}"
echo "retry-on-403=false" >>"${ARIA2_CONF}"
echo "retry-on-406=false" >>"${ARIA2_CONF}"
echo "retry-on-unknown=true" >>"${ARIA2_CONF}"
echo "rpc-listen-all=true" >>"${ARIA2_CONF}"
[[ $TZ != "Asia/Shanghai" ]] && sed -i '11,$s/#.*//;/^$/d' "${ARIA2_CONF}"
FILE_ALLOCATION_SET
}
CONVERSION_SCRIPT_CONF() {
sed -i "s@\(upload-log=\).*@\1${ARIA2_CONF_DIR}/upload.log@" "${SCRIPT_CONF}"
sed -i "s@\(move-log=\).*@\1${ARIA2_CONF_DIR}/move.log@" "${SCRIPT_CONF}"
sed -i "s@^\(dest-dir=\).*@\1${DOWNLOAD_DIR}/completed@" "${SCRIPT_CONF}"
}
CONVERSION_CORE() {
sed -i "s@\(ARIA2_CONF_DIR=\"\).*@\1${ARIA2_CONF_DIR}\"@" "${SCRIPT_DIR}/core"
}
DOWNLOAD_PROFILE() {
for PROFILE in ${PROFILES}; do
[[ ${PROFILE} = *.sh || ${PROFILE} = core ]] && cd "${SCRIPT_DIR}" || cd "${ARIA2_CONF_DIR}"
while [[ ! -f ${PROFILE} ]]; do
rm -rf ${PROFILE}
echo
echo -e "${INFO} Downloading '${PROFILE}' ..."
curl -O ${CURL_OPTIONS} ${PROFILE_URL1}/${PROFILE} ||
curl -O ${CURL_OPTIONS} ${PROFILE_URL2}/${PROFILE} ||
curl -O ${CURL_OPTIONS} ${PROFILE_URL3}/${PROFILE}
[[ -s ${PROFILE} ]] && {
[[ "${PROFILE}" = "aria2.conf" ]] && CONVERSION_ARIA2_CONF
[[ "${PROFILE}" = "script.conf" ]] && CONVERSION_SCRIPT_CONF
[[ "${PROFILE}" = "core" ]] && CONVERSION_CORE
echo
echo -e "${INFO} '${PROFILE}' download completed !"
} || {
echo
echo -e "${ERROR} '${PROFILE}' download error, retry ..."
sleep 3
}
done
done
}

View File

@ -0,0 +1,15 @@
#!/usr/bin/execlineb -S0
# _ _ ____ ____
# / \ _ __(_) __ _|___ \ | _ \ _ __ ___
# / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
# / ___ \| | | | (_| |/ __/ | __/| | | (_) |
# /_/ \_\_| |_|\__,_|_____| |_| |_| \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.
s6-svscanctl -t /var/run/s6/services

View File

@ -0,0 +1,18 @@
#!/usr/bin/with-contenv bash
# _ _ ____ ____
# / \ _ __(_) __ _|___ \ | _ \ _ __ ___
# / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
# / ___ \| | | | (_| |/ __/ | __/| | | (_) |
# /_/ \_\_| |_|\__,_|_____| |_| |_| \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.
umask ${UMASK_SET:-022}
exec s6-setuidgid p3terx aria2c \
--conf-path=/config/aria2.conf

View File

@ -260,6 +260,37 @@ services:
- proxynet
restart: always
aria2-pro:
container_name: aria2-pro
build:
context: "{{ airflow_worker_dir }}/aria2-pro-docker"
environment:
- PUID=${AIRFLOW_UID:-50000}
- PGID=0
- UMASK_SET=022
- RPC_SECRET={{ vault_aria2_rpc_secret }}
- RPC_PORT=6800
- LISTEN_PORT=6888
- DISK_CACHE=64M
- IPV6_MODE=false
- UPDATE_TRACKERS=false
- CUSTOM_TRACKER_URL=
- TZ=Asia/Shanghai
volumes:
- ${AIRFLOW_PROJ_DIR:-.}/aria2-config:/config
- ${AIRFLOW_PROJ_DIR:-.}/downloadfiles/videos/in-progress:/downloads
ports:
- "127.0.0.1:6800:6800"
- "6888:6888"
- "6888:6888/udp"
networks:
- proxynet
restart: unless-stopped
logging:
driver: json-file
options:
max-size: 1m
networks:
proxynet:
name: airflow_proxynet
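A quick way to confirm the aria2-pro service is reachable from another container on airflow_proxynet (for example before pointing the worker DAGs' 'aria-rpc' downloader at it) is a JSON-RPC call to aria2.getGlobalStat. A minimal sketch; the host name, port, and secret mirror the service definition above and would differ in other deployments:

import requests

payload = {
    "jsonrpc": "2.0",
    "id": "healthcheck",
    "method": "aria2.getGlobalStat",
    # The secret comes from RPC_SECRET ({{ vault_aria2_rpc_secret }} above), passed as "token:<secret>".
    "params": ["token:changeme"],
}

# "aria2-pro" resolves via the shared airflow_proxynet network; 6800 is the RPC port.
resp = requests.post("http://aria2-pro:6800/jsonrpc", json=payload, timeout=10)
resp.raise_for_status()
print(resp.json()["result"])  # e.g. {'numActive': '0', 'numWaiting': '0', ...}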

View File

@ -132,6 +132,8 @@ services:
- "--comms-log-root-dir" - "--comms-log-root-dir"
- "/app/logs/yt-dlp-ops/communication_logs" - "/app/logs/yt-dlp-ops/communication_logs"
- "--bgutils-no-innertube" - "--bgutils-no-innertube"
- "--visitor-rotation-threshold"
- "250"
{% endif %} {% endif %}
restart: unless-stopped restart: unless-stopped
pull_policy: always pull_policy: always

View File

@ -327,7 +327,7 @@ def manage_system_callable(**context):
action = params["action"] action = params["action"]
# For Thrift actions, use the new management host/port # For Thrift actions, use the new management host/port
if entity not in ["airflow_meta", "activity_counters"]: if entity not in ["activity_counters"]:
host = params["management_host"] host = params["management_host"]
port = params["management_port"] port = params["management_port"]
else: else:
@ -343,7 +343,6 @@ def manage_system_callable(**context):
"account": ["list_with_status", "ban", "unban", "unban_all", "delete_from_redis"], "account": ["list_with_status", "ban", "unban", "unban_all", "delete_from_redis"],
"client": ["list_with_status", "delete_from_redis"], "client": ["list_with_status", "delete_from_redis"],
"accounts_and_proxies": ["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis"], "accounts_and_proxies": ["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis"],
"airflow_meta": ["clear_dag_runs"],
"activity_counters": ["list_with_status"], "activity_counters": ["list_with_status"],
} }
@ -364,41 +363,6 @@ def manage_system_callable(**context):
if action in ["ban", "unban"] and not account_id: if action in ["ban", "unban"] and not account_id:
raise ValueError(f"An 'account_id' is required for account action '{action}'.") raise ValueError(f"An 'account_id' is required for account action '{action}'.")
# --- Handle Airflow Meta actions separately as they don't use Thrift ---
if entity == "airflow_meta":
dag_id = params.get("dag_id_to_manage")
if action == "clear_dag_runs":
clear_scope = params.get("clear_scope")
logger.info(f"Attempting to delete DagRuns for DAG '{dag_id}' with scope '{clear_scope}'.")
with create_session() as session:
dag_run_query = session.query(DagRun).filter(DagRun.dag_id == dag_id)
if clear_scope == "last_run":
last_run = dag_run_query.order_by(DagRun.execution_date.desc()).first()
if not last_run:
logger.info(f"No runs found for DAG '{dag_id}'. Nothing to delete.")
print(f"\nNo runs found for DAG '{dag_id}'.\n")
return
logger.warning(f"Deleting last DagRun for DAG '{dag_id}' (run_id: {last_run.run_id}, execution_date: {last_run.execution_date}). This will also delete its task instances.")
# Deleting the DagRun object should cascade and delete related TaskInstances.
session.delete(last_run)
deleted_count = 1
else: # all_runs
logger.warning(f"Deleting ALL DagRuns and associated TaskInstances for DAG '{dag_id}'. This will remove all history from the UI.")
# To ensure all related data is cleared, we explicitly delete TaskInstances first.
# This is safer than relying on DB-level cascades which may not be configured.
ti_deleted_count = session.query(TaskInstance).filter(TaskInstance.dag_id == dag_id).delete(synchronize_session=False)
logger.info(f"Deleted {ti_deleted_count} TaskInstance records for DAG '{dag_id}'.")
deleted_count = dag_run_query.delete(synchronize_session=False)
# The session is committed automatically by the `with create_session()` context manager.
logger.info(f"Successfully deleted {deleted_count} DagRun(s) for DAG '{dag_id}'.")
print(f"\nSuccessfully deleted {deleted_count} DagRun(s) for DAG '{dag_id}'.\n")
return # End execution
# --- Handle Activity Counter action ---
if entity == "activity_counters":
@ -855,13 +819,13 @@ with DAG(
"entity": Param(
"accounts_and_proxies",
type="string",
enum=["account", "proxy", "client", "accounts_and_proxies", "activity_counters", "airflow_meta"],
enum=["account", "proxy", "client", "accounts_and_proxies", "activity_counters"],
description="The type of entity to manage.",
),
"action": Param(
"list_with_status",
type="string",
enum=["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis", "clear_dag_runs"],
enum=["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis"],
description="""The management action to perform.
---
#### Actions for `entity: proxy`
@ -895,10 +859,6 @@ with DAG(
- `unban_all`: Un-ban all proxies for a `server_identity` (or all servers) AND all accounts (optionally filtered by `account_id` as a prefix).
- `delete_from_redis`: Deletes both account and proxy status from Redis via Thrift service. For accounts, if `account_id` is provided as a prefix, it deletes all accounts matching that prefix. If `account_id` is empty, it deletes ALL accounts. For proxies, if `server_identity` is provided, it deletes all proxies for that server. If `server_identity` is empty, it deletes ALL proxies across all servers.
#### Actions for `entity: airflow_meta`
- `clear_dag_runs`: **(Destructive)** Deletes DAG run history and associated task instances from the database, removing them from the UI. This allows the runs to be re-created if backfilling is enabled.
- `clear_scope: last_run`: Deletes only the most recent DAG run and its task instances.
- `clear_scope: all_runs`: Deletes all historical DAG runs and task instances for the selected DAG.
""", """,
), ),
"server_identity": Param( "server_identity": Param(
@ -922,20 +882,6 @@ with DAG(
title="Redis Connection ID", title="Redis Connection ID",
description="The Airflow connection ID for the Redis server (used for 'delete_from_redis' and for fetching detailed account status).", description="The Airflow connection ID for the Redis server (used for 'delete_from_redis' and for fetching detailed account status).",
), ),
"dag_id_to_manage": Param(
"ytdlp_ops_v01_worker_per_url",
type="string",
enum=["ytdlp_ops_v01_orchestrator", "ytdlp_ops_v01_dispatcher", "ytdlp_ops_v01_worker_per_url", "ytdlp_ops_v02_orchestrator_auth", "ytdlp_ops_v02_dispatcher_auth", "ytdlp_ops_v02_worker_per_url_auth", "ytdlp_ops_v02_orchestrator_dl", "ytdlp_ops_v02_dispatcher_dl", "ytdlp_ops_v02_worker_per_url_dl"],
title="[Airflow Meta] DAG ID",
description="The DAG ID to perform the action on.",
),
"clear_scope": Param(
"last_run",
type="string",
enum=["last_run", "all_runs"],
title="[Airflow Meta] Clear Scope",
description="For 'clear_dag_runs' action, specifies the scope of runs to clear.",
),
},
) as dag:
system_management_task = PythonOperator(
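The ban/unban actions exercised here (the "ban CLI policy testing" mentioned in the commit message) can also be driven programmatically by triggering this management DAG with a conf payload. A minimal sketch against the Airflow stable REST API; the DAG id, base URL, and credentials are placeholders for this deployment:

import requests

AIRFLOW_API = "http://localhost:8080/api/v1"   # placeholder base URL
DAG_ID = "ytdlp_ops_manage_system"             # placeholder; use the real management DAG id

conf = {
    "entity": "accounts_and_proxies",
    "action": "ban",
    "account_id": "accountpool_01",            # required for 'ban'/'unban'
    "server_identity": "worker-01",            # optional proxy scope
}

resp = requests.post(
    f"{AIRFLOW_API}/dags/{DAG_ID}/dagRuns",
    json={"conf": conf},
    auth=("airflow", "airflow"),               # basic auth; real auth is deployment-specific
    timeout=30,
)
resp.raise_for_status()
print("triggered run:", resp.json()["dag_run_id"])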

View File

@ -15,7 +15,9 @@ from datetime import datetime
from airflow.exceptions import AirflowException
from airflow.models.dag import DAG
from airflow.models.dagrun import DagRun
from airflow.models.param import Param
from airflow.models.taskinstance import TaskInstance
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.operators.empty import EmptyOperator
from airflow.operators.bash import BashOperator
@ -23,6 +25,7 @@ from airflow.providers.celery.executors.celery_executor import app as celery_app
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
from airflow.models.variable import Variable
from airflow.utils.session import create_session
import requests
# Configure logging
@ -276,7 +279,10 @@ def dump_redis_data_to_csv(redis_client, dump_dir, patterns):
def clear_queue_callable(**context):
"""Dumps Redis data to CSV and/or clears specified Redis keys based on selection."""
"""
Dumps Redis data to CSV and/or clears specified Redis keys based on selection.
The `_skipped` queue is for videos that are unavailable due to external reasons (e.g., private, removed).
"""
params = context['params']
ti = context['task_instance']
logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
@ -315,7 +321,7 @@ def clear_queue_callable(**context):
logger.info("Dumping is enabled. Performing dump before clearing.")
dump_redis_data_to_csv(redis_client, dump_dir, dump_patterns)
all_suffixes = ['_inbox', '_fail', '_result', '_progress']
all_suffixes = ['_inbox', '_fail', '_result', '_progress', '_skipped']
keys_to_delete = set()
for queue_base_name in queue_base_names_to_clear:
if '_all' in queues_to_clear_options:
@ -420,7 +426,10 @@ def list_contents_callable(**context):
def check_status_callable(**context):
"""Checks the status (type and size) of all standard Redis queues for a given base name."""
"""
Checks the status (type and size) of all standard Redis queues for a given base name.
The `_skipped` queue is for videos that are unavailable due to external reasons (e.g., private, removed).
"""
params = context['params']
ti = context['task_instance']
logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
@ -436,7 +445,7 @@ def check_status_callable(**context):
else:
raise ValueError(f"Invalid queue_system: {queue_system}")
queue_suffixes = ['_inbox', '_progress', '_result', '_fail']
queue_suffixes = ['_inbox', '_progress', '_result', '_fail', '_skipped']
logger.info(f"--- Checking Status for Queue System: '{queue_system}' ---")
@ -575,6 +584,56 @@ def purge_celery_queue_callable(**context):
logger.info("--- Purge complete. ---")
def clear_dag_runs_callable(**context):
"""
Deletes DAG run history and associated task instances from the database.
"""
params = context['params']
dag_id = params.get("dag_id_to_manage")
clear_scope = params.get("clear_scope")
log_target = f"DAG '{dag_id}'" if dag_id != "ALL_DAGS" else "ALL DAGS (except ytdlp_mgmt_queues)"
logger.info(f"Attempting to delete DagRuns for {log_target} with scope '{clear_scope}'.")
with create_session() as session:
dag_run_query = session.query(DagRun)
if dag_id == "ALL_DAGS":
dag_run_query = dag_run_query.filter(DagRun.dag_id != 'ytdlp_mgmt_queues')
else:
dag_run_query = dag_run_query.filter(DagRun.dag_id == dag_id)
if clear_scope == "last_run":
if dag_id == "ALL_DAGS":
raise AirflowException("Cannot clear 'last_run' for ALL_DAGS. Please select a specific DAG.")
last_run = dag_run_query.order_by(DagRun.execution_date.desc()).first()
if not last_run:
logger.info(f"No runs found for DAG '{dag_id}'. Nothing to delete.")
print(f"\nNo runs found for DAG '{dag_id}'.\n")
return
logger.warning(f"Deleting last DagRun for DAG '{dag_id}' (run_id: {last_run.run_id}, execution_date: {last_run.execution_date}). This will also delete its task instances.")
session.delete(last_run)
deleted_count = 1
else: # all_runs
logger.warning(f"Deleting ALL DagRuns and associated TaskInstances for {log_target}. This will remove all history from the UI.")
ti_query = session.query(TaskInstance)
if dag_id == "ALL_DAGS":
ti_query = ti_query.filter(TaskInstance.dag_id != 'ytdlp_mgmt_queues')
else:
ti_query = ti_query.filter(TaskInstance.dag_id == dag_id)
ti_deleted_count = ti_query.delete(synchronize_session=False)
logger.info(f"Deleted {ti_deleted_count} TaskInstance records for {log_target}.")
deleted_count = dag_run_query.delete(synchronize_session=False)
# The session is committed automatically by the `with create_session()` context manager.
logger.info(f"Successfully deleted {deleted_count} DagRun(s) for {log_target}.")
print(f"\nSuccessfully deleted {deleted_count} DagRun(s) for {log_target}.\n")
def add_videos_to_queue_callable(**context):
"""
Parses video inputs from manual text, a predefined file, or a file path/URL,
@ -671,12 +730,13 @@ with DAG(
- `check_status`: Check the overall status of the queues.
- `requeue_failed`: Copy all URLs from the `_fail` hash to the `_inbox` list and clear the `_fail` hash.
- `purge_celery_queue`: **(Destructive)** Removes all tasks from a specified Celery worker queue (e.g., `queue-dl`). This is useful for clearing out a backlog of tasks that were queued before a dispatcher was paused.
- `clear_dag_runs`: **(Destructive)** Deletes DAG run history and associated task instances from the database, removing them from the UI.
""", """,
params={ params={
"action": Param( "action": Param(
"list_contents", "list_contents",
type="string", type="string",
enum=["add_videos", "clear_queue", "list_contents", "check_status", "requeue_failed", "inspect_celery_cluster", "purge_celery_queue"], enum=["add_videos", "clear_queue", "list_contents", "check_status", "requeue_failed", "inspect_celery_cluster", "purge_celery_queue", "clear_dag_runs"],
title="Action", title="Action",
description="The management action to perform.", description="The management action to perform.",
), ),
@ -737,7 +797,7 @@ with DAG(
description="Select which standard queues to clear. '_all' clears all four. If left empty, it defaults to '_all'.", description="Select which standard queues to clear. '_all' clears all four. If left empty, it defaults to '_all'.",
items={ items={
"type": "string", "type": "string",
"enum": ["_inbox", "_fail", "_result", "_progress", "_all"], "enum": ["_inbox", "_fail", "_result", "_progress", "_skipped", "_all"],
} }
), ),
"confirm_clear": Param( "confirm_clear": Param(
@ -766,7 +826,7 @@ with DAG(
),
# --- Params for 'list_contents' ---
"queue_to_list": Param(
'video_queue_inbox,queue2_auth_inbox,queue2_dl_result',
'video_queue_inbox,queue2_auth_inbox,queue2_dl_inbox,queue2_dl_result',
type="string",
title="[list_contents] Queues to List",
description="Comma-separated list of exact Redis key names to list.",
@ -797,6 +857,21 @@ with DAG(
title="[purge_celery_queue] Confirm Purge",
description="Must be set to True to execute the 'purge_celery_queue' action. This is a destructive operation that removes all tasks from the specified Celery queue(s).",
),
# --- Params for 'clear_dag_runs' ---
"dag_id_to_manage": Param(
"ALL_DAGS",
type="string",
enum=["ALL_DAGS", "ytdlp_ops_v01_orchestrator", "ytdlp_ops_v01_dispatcher", "ytdlp_ops_v01_worker_per_url", "ytdlp_ops_v02_orchestrator_auth", "ytdlp_ops_v02_dispatcher_auth", "ytdlp_ops_v02_worker_per_url_auth", "ytdlp_ops_v02_orchestrator_dl", "ytdlp_ops_v02_dispatcher_dl", "ytdlp_ops_v02_worker_per_url_dl"],
title="[clear_dag_runs] DAG ID",
description="The DAG ID to perform the action on. Select 'ALL_DAGS' to clear history for all DAGs.",
),
"clear_scope": Param(
"all_runs",
type="string",
enum=["last_run", "all_runs"],
title="[clear_dag_runs] Clear Scope",
description="For 'clear_dag_runs' action, specifies the scope of runs to clear.",
),
# --- Common Params ---
"redis_conn_id": Param(
DEFAULT_REDIS_CONN_ID,
@ -866,6 +941,11 @@ with DAG(
python_callable=purge_celery_queue_callable,
)
action_clear_dag_runs = PythonOperator(
task_id="action_clear_dag_runs",
python_callable=clear_dag_runs_callable,
)
# --- Wire up tasks ---
branch_on_action >> [
action_add_videos,
@ -875,4 +955,5 @@ with DAG(
action_requeue_failed,
action_inspect_celery_cluster,
action_purge_celery_queue,
action_clear_dag_runs,
]
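The new `_skipped` suffix introduced above is described in the docstrings as holding videos that are unavailable for external reasons (private, removed, geo-restricted). A minimal sketch of how a worker might park such a URL; it assumes a hash keyed by URL, mirroring the `_fail` queue, which is an assumption since the exact structure is not shown in this diff:

import json
from datetime import datetime, timezone

def mark_url_skipped(redis_client, queue_name: str, url: str, error_code: str) -> None:
    # Assumption: `<queue_name>_skipped` is a hash keyed by URL, like `_fail`.
    payload = {
        "error_code": error_code,  # e.g. "PRIVATE_VIDEO", "GEO_RESTRICTED", "VIDEO_REMOVED"
        "skipped_at": datetime.now(timezone.utc).isoformat(),
    }
    redis_client.hset(f"{queue_name}_skipped", url, json.dumps(payload))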

View File

@ -20,7 +20,7 @@ from airflow.utils.dates import days_ago
from airflow.api.common.trigger_dag import trigger_dag
from airflow.models.dagrun import DagRun
from airflow.models.dag import DagModel
from datetime import timedelta
from datetime import timedelta, datetime
import logging
import random
import time
@ -37,41 +37,6 @@ from thrift.transport import TSocket, TTransport
# Configure logging
logger = logging.getLogger(__name__)
DEFAULT_REQUEST_PARAMS_JSON = """{
"context_reuse_policy": {
"enabled": true,
"max_age_seconds": 86400,
"reuse_visitor_id": true,
"reuse_cookies": true
},
"token_generation_strategy": {
"youtubei_js": {
"generate_po_token": true,
"generate_gvs_token": true
}
},
"ytdlp_params": {
"use_curl_prefetch": false,
"token_supplement_strategy": {
"youtubepot_bgutilhttp_extractor": {
"enabled": true
}
},
"visitor_id_override": {
"enabled": true
}
},
"session_params": {
"lang": "en-US",
"location": "US",
"deviceCategory": "MOBILE",
"user_agents": {
"youtubei_js": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)",
"yt_dlp": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)"
}
}
}"""
# Default settings
DEFAULT_QUEUE_NAME = 'video_queue'
DEFAULT_REDIS_CONN_ID = 'redis_default'
@ -191,6 +156,17 @@ def orchestrate_workers_ignition_callable(**context):
dag_run_id = context['dag_run'].run_id
total_triggered = 0
# --- Generate a consistent timestamped prefix for this orchestrator run ---
# This ensures all workers spawned from this run use the same set of accounts.
final_account_pool_prefix = params['account_pool']
if params.get('prepend_client_to_account') and params.get('account_pool_size') is not None:
clients_str = params.get('clients', '')
primary_client = clients_str.split(',')[0].strip() if clients_str else 'unknown'
# Use a timestamp from the orchestrator's run for consistency
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
final_account_pool_prefix = f"{params['account_pool']}_{timestamp}_{primary_client}"
logger.info(f"Generated consistent account prefix for this run: '{final_account_pool_prefix}'")
for i, bunch in enumerate(bunches):
logger.info(f"--- Triggering Bunch {i+1}/{len(bunches)} (contains {len(bunch)} dispatcher(s)) ---")
for j, _ in enumerate(bunch):
@ -199,6 +175,8 @@ def orchestrate_workers_ignition_callable(**context):
# Pass all orchestrator params to the dispatcher, which will then pass them to the worker.
conf_to_pass = {p: params[p] for p in params}
# Override account_pool with the generated prefix
conf_to_pass['account_pool'] = final_account_pool_prefix
logger.info(f"Triggering dispatcher {j+1}/{len(bunch)} in bunch {i+1} (run {total_triggered + 1}/{total_workers}) (Run ID: {run_id})") logger.info(f"Triggering dispatcher {j+1}/{len(bunch)} in bunch {i+1} (run {total_triggered + 1}/{total_workers}) (Run ID: {run_id})")
logger.debug(f"Full conf for dispatcher run {run_id}: {conf_to_pass}") logger.debug(f"Full conf for dispatcher run {run_id}: {conf_to_pass}")
@ -343,18 +321,13 @@ with DAG(
"'proceed_loop': (Default) Mark URL as failed but continue the processing loop with a new URL. " "'proceed_loop': (Default) Mark URL as failed but continue the processing loop with a new URL. "
"'retry_with_new_token': Attempt to get a new token with a new account and retry the download once. If it fails again, proceed loop." "'retry_with_new_token': Attempt to get a new token with a new account and retry the download once. If it fails again, proceed loop."
), ),
'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."), 'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with per-request parameters to override server defaults. Can be a full JSON object or comma-separated key=value pairs (e.g., 'session_params.location=DE,ytdlp_params.skip_cache=true')."),
'language_code': Param('en-US', type="string", title="[Worker Param] Language Code", description="The language code (e.g., 'en-US', 'de-DE') to use for the YouTube request headers."),
'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="[Worker Param] Base name for Redis queues."), 'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="[Worker Param] Base name for Redis queues."),
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."), 'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."),
'clients': Param( 'clients': Param(
'tv_simply', 'tv_simply',
type="string", type="string",
enum=[
'tv_simply',
'mweb',
'tv',
'custom',
],
title="[Worker Param] Clients", title="[Worker Param] Clients",
description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_embedded. See DAG documentation for details." description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_embedded. See DAG documentation for details."
), ),
@ -370,23 +343,16 @@ with DAG(
'delay_between_formats_s': Param(15, type="integer", title="[Worker Param] Delay Between Formats (s)", description="Delay in seconds between downloading each format when multiple formats are specified. A 22s wait may be effective for batch downloads, while 6-12s may suffice if cookies are refreshed regularly."), 'delay_between_formats_s': Param(15, type="integer", title="[Worker Param] Delay Between Formats (s)", description="Delay in seconds between downloading each format when multiple formats are specified. A 22s wait may be effective for batch downloads, while 6-12s may suffice if cookies are refreshed regularly."),
'yt_dlp_test_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Test Mode", description="If True, runs yt-dlp with --test flag (dry run without downloading)."), 'yt_dlp_test_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Test Mode", description="If True, runs yt-dlp with --test flag (dry run without downloading)."),
'skip_probe': Param(True, type="boolean", title="[Worker Param] Skip Probe", description="If True, skips the ffmpeg probe of downloaded files."), 'skip_probe': Param(True, type="boolean", title="[Worker Param] Skip Probe", description="If True, skips the ffmpeg probe of downloaded files."),
'yt_dlp_cleanup_mode': Param(True, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."), 'yt_dlp_cleanup_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
'socket_timeout': Param(15, type="integer", title="[Worker Param] Socket Timeout", description="Timeout in seconds for socket operations."), 'socket_timeout': Param(15, type="integer", title="[Worker Param] Socket Timeout", description="Timeout in seconds for socket operations."),
'download_format_preset': Param( 'download_format': Param(
'format_1', 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
type="string", type="string",
enum=['format_1', 'format_2', 'custom'], title="[Worker Param] Download Format",
title="[Worker Param] Download Format Preset", description="Custom yt-dlp format string. Common presets: [1] 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best' (Default, best quality MP4). [2] '18-dashy/18,140-dashy/140,133-dashy/134-dashy/136-dashy/137-dashy/250-dashy/298-dashy/299-dashy' (Legacy formats). [3] '299-dashy/298-dashy/250-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy' (High-framerate formats)."
description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformat_1: 18-dashy/18,140-dashy/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformat_2: (299/298/137/136/135/134/133)-dashy"
),
'download_format_custom': Param(
'18-dashy/18,140-dashy/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy',
type="string",
title="[Worker Param] Custom Download Format",
description="Custom yt-dlp format string. Used when preset is 'custom'. E.g., 'ba[ext=m4a]/bestaudio/best'."
), ),
'downloader': Param(
'py',
'cli',
type="string",
enum=['py', 'aria-rpc', 'cli'],
title="[Worker Param] Download Tool",
@ -396,7 +362,7 @@ with DAG(
'aria_port': Param(6800, type="integer", title="[Worker Param] Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server. Can be set via Airflow Variable 'YTDLP_ARIA_PORT'."),
'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="[Worker Param] Aria2c Secret", description="For 'aria-rpc' downloader: Secret token. Can be set via Airflow Variable 'YTDLP_ARIA_SECRET'."),
'yt_dlp_extra_args': Param(
'--no-resize-buffer --buffer-size 4M --min-sleep-interval 5 --max-sleep-interval 10',
'',
type=["string", "null"],
title="[Worker Param] Extra yt-dlp arguments",
),
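The reworked request_params_json description above says the value may be either a full JSON object or comma-separated dotted key=value pairs, and the worker's new language_code handling appends pairs such as session_params.lang=en-US in exactly that form. The actual parsing happens on the token-service side and is not part of this diff; a sketch of how such pairs could expand into a nested dict, for illustration only:

import json

def parse_request_params(raw: str) -> dict:
    """Accept either a JSON object or comma-separated dotted key=value pairs."""
    if not raw:
        return {}
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass
    result: dict = {}
    for pair in raw.split(","):
        key, _, value = pair.partition("=")
        node = result
        *parents, leaf = key.strip().split(".")
        for part in parents:
            node = node.setdefault(part, {})
        # Best-effort literal conversion for booleans/numbers, else keep the string.
        try:
            node[leaf] = json.loads(value)
        except json.JSONDecodeError:
            node[leaf] = value.strip()
    return result

# parse_request_params("session_params.location=DE,ytdlp_params.skip_cache=true")
# -> {"session_params": {"location": "DE"}, "ytdlp_params": {"skip_cache": True}}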

View File

@ -17,7 +17,7 @@ from __future__ import annotations
from airflow.decorators import task, task_group
from airflow.exceptions import AirflowException, AirflowSkipException
from airflow.models import Variable
from airflow.models.dag import DAG
from airflow.models.dag import DAG, DagModel
from airflow.models.param import Param
from airflow.models.xcom_arg import XComArg
from airflow.operators.dummy import DummyOperator
@ -174,14 +174,9 @@ def _get_account_pool(params: dict) -> list:
is_prefix_mode = True
pool_size = int(pool_size_param)
if params.get('prepend_client_to_account', True):
clients_str = params.get('clients', '')
primary_client = clients_str.split(',')[0].strip() if clients_str else 'unknown'
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
new_prefix = f"{prefix}_{timestamp}_{primary_client}"
accounts = [f"{new_prefix}_{i:02d}" for i in range(1, pool_size + 1)]
else:
accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
# The orchestrator now generates the full prefix if prepend_client_to_account is True.
# The worker just appends the numbers.
accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
else:
accounts = [prefix]
@ -258,12 +253,26 @@ def get_url_and_assign_account(**context):
# For manual runs, we fall back to 'manual_url_to_process'.
url_to_process = params.get('url_to_process')
if not url_to_process:
url_to_process = params.get('manual_url_to_process')
if url_to_process:
logger.info(f"Using URL from manual run parameter: '{url_to_process}'")
manual_url_input = params.get('manual_url_to_process')
if manual_url_input:
logger.info(f"Using URL from manual run parameter: '{manual_url_input}'")
if manual_url_input == 'PULL_FROM_QUEUE':
logger.info("Manual run is set to pull from queue.")
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
queue_name = params.get('queue_name', DEFAULT_QUEUE_NAME)
inbox_queue = f"{queue_name}_inbox"
client = _get_redis_client(redis_conn_id)
url_bytes = client.lpop(inbox_queue)
if not url_bytes:
logger.info("Redis queue is empty. No work to do. Skipping task.")
raise AirflowSkipException("Redis queue is empty. No work to do.")
url_to_process = url_bytes.decode('utf-8')
logger.info(f"Pulled URL '{url_to_process}' from queue '{inbox_queue}'.")
else:
url_to_process = manual_url_input
if not url_to_process:
raise AirflowException("No URL to process. For manual runs, please provide a URL in the 'manual_url_to_process' parameter.")
raise AirflowException("No URL to process. For manual runs, please provide a URL in the 'manual_url_to_process' parameter, or 'PULL_FROM_QUEUE'.")
logger.info(f"Received URL '{url_to_process}' to process.")
# Mark the URL as in-progress in Redis
@ -310,9 +319,26 @@ def get_token(initial_data: dict, **context):
host, port = params['service_ip'], int(params['service_port'])
machine_id = params.get('machine_id') or socket.gethostname()
clients = params.get('clients')
request_params_json = params.get('request_params_json', '{}')
request_params_json = params.get('request_params_json')
language_code = params.get('language_code')
assigned_proxy_url = params.get('assigned_proxy_url')
if language_code:
try:
params_dict = json.loads(request_params_json)
logger.info(f"Setting language for request: {language_code}")
if 'session_params' not in params_dict:
params_dict['session_params'] = {}
params_dict['session_params']['lang'] = language_code
request_params_json = json.dumps(params_dict)
except (json.JSONDecodeError, TypeError):
logger.warning("Could not parse request_params_json as JSON. Treating as key=value pairs and appending language code.")
lang_kv = f"session_params.lang={language_code}"
if request_params_json:
request_params_json += f",{lang_kv}"
else:
request_params_json = lang_kv
video_id = _extract_video_id(url)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
job_dir_name = f"{timestamp}-{video_id or 'unknown'}"
@ -355,18 +381,39 @@
if process.returncode != 0:
error_message = "ytops-client failed. See logs for details."
for line in reversed(process.stderr.strip().split('\n')):
if 'ERROR' in line or 'Thrift error' in line or 'Connection to server failed' in line:
error_message = line.strip()
break
# Try to find a more specific error message from the Thrift client's output
thrift_error_match = re.search(r'A Thrift error occurred: (.*)', process.stderr)
if thrift_error_match:
error_message = thrift_error_match.group(1).strip()
else: # Fallback to old line-by-line parsing
for line in reversed(process.stderr.strip().split('\n')):
if 'ERROR' in line or 'Thrift error' in line or 'Connection to server failed' in line:
error_message = line.strip()
break
# Determine error code for branching logic
error_code = 'GET_INFO_CLIENT_FAIL'
if "BOT_DETECTED" in process.stderr:
error_code = "BOT_DETECTED"
elif "BOT_DETECTION_SIGN_IN_REQUIRED" in process.stderr:
error_code = "BOT_DETECTION_SIGN_IN_REQUIRED"
elif "Connection to server failed" in process.stderr:
error_code = "TRANSPORT_ERROR"
stderr_lower = process.stderr.lower()
# These patterns should match the error codes from PBUserException and others
error_patterns = {
"BOT_DETECTED": ["bot_detected"],
"BOT_DETECTION_SIGN_IN_REQUIRED": ["bot_detection_sign_in_required"],
"TRANSPORT_ERROR": ["connection to server failed"],
"PRIVATE_VIDEO": ["private video"],
"COPYRIGHT_REMOVAL": ["copyright"],
"GEO_RESTRICTED": ["in your country"],
"VIDEO_REMOVED": ["video has been removed"],
"VIDEO_UNAVAILABLE": ["video unavailable"],
"MEMBERS_ONLY": ["members-only"],
"AGE_GATED_SIGN_IN": ["sign in to confirm your age"],
"VIDEO_PROCESSING": ["processing this video"],
}
for code, patterns in error_patterns.items():
if any(p in stderr_lower for p in patterns):
error_code = code
break # Found a match, stop searching
error_details = {
'error_message': error_message,
@ -381,8 +428,23 @@
if proxy_match:
proxy = proxy_match.group(1).strip()
# Rename the info.json to include the proxy for the download worker
final_info_json_path = info_json_path
if proxy:
# Sanitize for filename: replace '://' which is invalid in paths. Colons are usually fine.
sanitized_proxy = proxy.replace('://', '---')
new_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}_proxy_{sanitized_proxy}.json"
new_path = os.path.join(job_dir_path, new_filename)
try:
os.rename(info_json_path, new_path)
final_info_json_path = new_path
logger.info(f"Renamed info.json to include proxy: {new_path}")
except OSError as e:
logger.error(f"Failed to rename info.json to include proxy: {e}. Using original path.")
return {
'info_json_path': info_json_path,
'info_json_path': final_info_json_path,
'job_dir_path': job_dir_path,
'socks_proxy': proxy,
'ytdlp_command': None,
@ -407,10 +469,15 @@ def handle_bannable_error_branch(task_id_to_check: str, **context):
error_code = error_details.get('error_code', '').strip()
policy = params.get('on_auth_failure', 'retry_with_new_account')
# Check if this is an age confirmation error - should not stop the loop
if "Sign in to confirm your age" in error_message or "confirm your age" in error_message.lower():
logger.info(f"Age confirmation error detected for '{task_id_to_check}'. This is a content restriction, not a bot detection issue.")
return 'handle_age_restriction_error'
# Unrecoverable video errors that should not be retried or treated as system failures.
unrecoverable_video_errors = [
"AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "COPYRIGHT_REMOVAL",
"GEO_RESTRICTED", "VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED"
]
if error_code in unrecoverable_video_errors:
logger.warning(f"Unrecoverable video error '{error_code}' detected for '{task_id_to_check}'. This is a content issue, not a system failure.")
return 'handle_unrecoverable_video_error'
# Fatal Thrift connection errors that should stop all processing.
if error_code == 'TRANSPORT_ERROR':
@ -646,6 +713,65 @@ def list_available_formats(token_data: dict, **context):
return []
def _resolve_generic_selector(selector: str, info_json_path: str, logger) -> str | list[str] | None:
"""
Uses yt-dlp to resolve a generic format selector into specific, numeric format ID(s).
Returns a numeric selector string (e.g., '18'), a list of IDs for '+' selectors
(e.g., ['299', '140']), or None if resolution fails.
"""
import subprocess
import shlex
try:
cmd = [
'yt-dlp',
'--print', 'format_id',
'-f', selector,
'--load-info-json', info_json_path,
]
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
logger.info(f"Resolving generic selector '{selector}' with command: {copy_paste_cmd}")
process = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
if process.stderr:
# yt-dlp often prints warnings to stderr that are not fatal.
# e.g., "Requested format selector '...' contains no available formats"
logger.info(f"yt-dlp resolver STDERR for selector '{selector}':\n{process.stderr}")
if process.returncode != 0:
logger.error(f"yt-dlp resolver for selector '{selector}' failed with exit code {process.returncode}")
return None
output_ids = process.stdout.strip().split('\n')
output_ids = [fid for fid in output_ids if fid] # Remove empty lines
if not output_ids:
logger.warning(f"Selector '{selector}' resolved to no format IDs.")
return None
# yt-dlp might return '137+140' on one line, or '137\n140' on multiple.
# We need to handle both to get individual IDs.
final_ids = []
for fid in output_ids:
final_ids.extend(fid.split('+'))
# If the original selector was for merging (contained '+'), return individual IDs for separate downloads.
# Otherwise, yt-dlp has already chosen the best one from a fallback list, so we just use it.
if '+' in selector:
resolved_selector = final_ids
else:
resolved_selector = final_ids[0] # yt-dlp gives the single best choice
logger.info(f"Successfully resolved selector '{selector}' to '{resolved_selector}'.")
return resolved_selector
except Exception as e:
logger.error(f"An error occurred while resolving selector '{selector}': {e}", exc_info=True)
return None
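A usage sketch for the new helper above, assuming yt-dlp is on PATH and an info.json has already been written by the auth worker (paths and the selector are placeholders):

import logging

logger = logging.getLogger(__name__)

resolved = _resolve_generic_selector(
    "bestvideo[ext=mp4]+bestaudio[ext=m4a]",
    "/downloads/20251201_120000-abc123/info_abc123.json",
    logger,
)
if resolved is None:
    logger.warning("Selector could not be resolved; falling back to the raw selector.")
elif isinstance(resolved, list):
    logger.info("Merge selector resolved to separate downloads: %s", resolved)  # e.g. ['137', '140']
else:
    logger.info("Resolved to a single format id: %s", resolved)  # e.g. '18'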
@task
def download_and_probe(token_data: dict, available_formats: list[str], **context):
"""
@ -660,26 +786,33 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
try:
params = context['params']
info_json_path = token_data.get('info_json_path')
proxy = token_data.get('socks_proxy')
original_url = token_data.get('original_url') original_url = token_data.get('original_url')
# Extract proxy from filename, with fallback to token_data for backward compatibility
proxy = None
if info_json_path:
filename = os.path.basename(info_json_path)
proxy_match = re.search(r'_proxy_(.+)\.json$', filename)
if proxy_match:
sanitized_proxy = proxy_match.group(1)
# Reverse sanitization from auth worker (replace '---' with '://')
proxy = sanitized_proxy.replace('---', '://')
logger.info(f"Extracted proxy '{proxy}' from filename.")
if not proxy:
logger.warning("Proxy not found in filename. Falling back to 'socks_proxy' from token_data.")
proxy = token_data.get('socks_proxy')
download_dir = token_data.get('job_dir_path')
if not download_dir:
# Fallback for older runs or if job_dir_path is missing
download_dir = os.path.dirname(info_json_path)
- format_preset = params.get('download_format_preset', 'format_1')
- if format_preset == 'custom':
- download_format = params.get('download_format_custom')
- if not download_format:
- raise AirflowException("Format preset is 'custom' but no custom format string was provided.")
- elif format_preset == 'format_1':
- download_format = '18-dashy/18,140-dashy/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy'
- elif format_preset == 'format_2':
- download_format = '(299/298/137/136/135/134/133)-dashy'
- else:
- download_format = '18-dashy/18,140-dashy/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy'
+ download_format = params.get('download_format')
+ if not download_format:
+ raise AirflowException("The 'download_format' parameter is missing or empty.")
- output_template = params.get('output_path_template', "%(title)s [%(id)s].f%(format_id)s.%(ext)s")
+ output_template = params.get('output_path_template', "%(id)s.f%(format_id)s.%(ext)s")
full_output_path = os.path.join(download_dir, output_template)
retry_on_probe_failure = params.get('retry_on_probe_failure', False)
@ -706,15 +839,16 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
downloader = params.get('downloader', 'py')
cmd = ['ytops-client', 'download', downloader, '--load-info-json', info_json_path, '-f', format_selector]
- if proxy:
- cmd.extend(['--proxy', proxy])
if downloader == 'py':
+ if proxy:
+ cmd.extend(['--proxy', proxy])
cmd.extend(['--output-dir', download_dir])
# The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args
# The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args
- py_extra_args = []
- if params.get('fragment_retries'):
- py_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])])
+ py_extra_args = ['--output', output_template, '--no-resize-buffer', '--buffer-size', '4M']
if params.get('socket_timeout'):
py_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])])
if params.get('yt_dlp_test_mode'):
@ -727,12 +861,29 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
cmd.extend(['--extra-ytdlp-args', final_extra_args_str])
elif downloader == 'aria-rpc':
# For aria2c running on the host, the proxy (if also on the host) should be referenced via localhost.
# The user-agent is set by yt-dlp's extractor, not directly here. The default is Cobalt-based.
if proxy:
proxy_port_match = re.search(r':(\d+)$', proxy)
if proxy_port_match:
proxy_port = proxy_port_match.group(1)
aria_proxy = f"socks5://127.0.0.1:{proxy_port}"
cmd.extend(['--proxy', aria_proxy])
logger.info(f"Using translated proxy for host-based aria2c: {aria_proxy}")
else:
logger.warning(f"Could not parse port from proxy '{proxy}'. Passing it to aria2c as-is.")
cmd.extend(['--proxy', proxy])
# The remote-dir is the path relative to aria2c's working directory on the host.
# The output-dir is the container's local path to the same shared volume.
remote_dir = os.path.relpath(download_dir, '/opt/airflow/downloadfiles/videos')
cmd.extend([
'--aria-host', params.get('aria_host', '172.17.0.1'),
'--aria-port', str(params.get('aria_port', 6800)),
'--aria-secret', params.get('aria_secret'),
'--wait',
'--output-dir', download_dir,
+ '--remote-dir', remote_dir,
])
if 'dashy' in format_selector:
cmd.extend([
@ -743,9 +894,15 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
cmd.append('--cleanup')
elif downloader == 'cli':
- cmd.extend(['--output-dir', download_dir])
+ # Overwrite cmd to call yt-dlp directly
+ cmd = ['yt-dlp', '--load-info-json', info_json_path, '-f', format_selector]
+ if proxy:
+ cmd.extend(['--proxy', proxy])
# The 'cli' tool is the old yt-dlp wrapper, so it takes similar arguments.
- cli_extra_args = []
- if params.get('fragment_retries'):
- cli_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])])
+ cli_extra_args = ['--output', full_output_path, '--no-resize-buffer', '--buffer-size', '4M']
if params.get('socket_timeout'):
cli_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])])
if params.get('yt_dlp_test_mode'):
@ -754,11 +911,12 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '')
final_extra_args = existing_extra + cli_extra_args
if final_extra_args:
- cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)])
+ cmd.extend(final_extra_args)
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
- logger.info(f"--- Preparing to execute ytops-client ---")
- logger.info(f"Full ytops-client command for format '{format_selector}':")
+ tool_name = 'yt-dlp' if downloader == 'cli' else 'ytops-client'
+ logger.info(f"--- Preparing to execute {tool_name} ---")
+ logger.info(f"Full {tool_name} command for format '{format_selector}':")
logger.info(copy_paste_cmd)
logger.info(f"-----------------------------------------")
process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
@ -768,23 +926,44 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
if process.stderr:
logger.info(f"Download tool STDERR for format '{format_selector}':\n{process.stderr}")
- if process.returncode != 0:
+ if process.returncode != 0 or "ERROR:" in process.stderr:
logger.error(f"Download tool failed for format '{format_selector}' with exit code {process.returncode}")
- raise AirflowException(f"Download command failed for format '{format_selector}'. See logs for details.")
+ if "ERROR:" in process.stderr and process.returncode == 0:
+ logger.error("Detected 'ERROR:' in stderr, treating as failure despite exit code 0.")
+ # Pass stderr in the exception for better parsing in the outer try/except block
+ raise AirflowException(f"Download command failed for format '{format_selector}'. Stderr: {process.stderr}")
output_files = []
- for line in process.stdout.strip().split('\n'):
- # For aria-rpc, parse "Download and merge successful: <path>" or "Download successful: <path>"
- match = re.search(r'successful: (.+)', line)
- if match:
- filepath = match.group(1).strip()
- if os.path.exists(filepath):
- output_files.append(filepath)
- else:
- logger.warning(f"File path from aria-rpc output does not exist locally: '{filepath}'")
- # For py/cli, it's just the path
- elif os.path.exists(line.strip()):
- output_files.append(line.strip())
+ if downloader == 'cli':
+ # Parse yt-dlp's verbose output to find the final filename
+ final_filename = None
+ for line in process.stdout.strip().split('\n'):
+ # Case 1: Simple download, no merge
+ dest_match = re.search(r'\[download\] Destination: (.*)', line)
+ if dest_match:
+ final_filename = dest_match.group(1).strip()
+ # Case 2: Formats are merged into a new file. This path is absolute if -o is absolute.
+ merge_match = re.search(r'\[Merger\] Merging formats into "(.*)"', line)
+ if merge_match:
final_filename = merge_match.group(1).strip()
if final_filename and os.path.exists(final_filename):
output_files.append(final_filename)
else: # Logic for 'py' and 'aria-rpc'
for line in process.stdout.strip().split('\n'):
# For aria-rpc, parse "Download and merge successful: <path>" or "Download successful: <path>"
match = re.search(r'successful: (.+)', line)
if match:
filepath = match.group(1).strip()
if os.path.exists(filepath):
output_files.append(filepath)
else:
logger.warning(f"File path from aria-rpc output does not exist locally: '{filepath}'")
# For py, it's just the path
elif os.path.exists(line.strip()):
output_files.append(line.strip())
if not params.get('yt_dlp_test_mode') and not output_files:
raise AirflowException(f"Download for format '{format_selector}' finished but no output files were found or exist.")
@ -797,7 +976,7 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
"""Probes a file with ffmpeg to check for corruption."""
logger.info(f"Probing downloaded file: {filename}")
try:
- subprocess.run(['ffmpeg', '-v', 'error', '-i', filename, '-f', 'null', '-'], check=True, capture_output=True, text=True)
+ subprocess.run(['ffmpeg', '-v', 'error', '-sseof', '-10', '-i', filename, '-c', 'copy', '-f', 'null', '-'], check=True, capture_output=True, text=True)
logger.info(f"SUCCESS: Probe confirmed valid media file: {filename}")
except subprocess.CalledProcessError as e:
logger.error(f"ffmpeg probe failed for '{filename}'. File may be corrupt.")
@ -864,30 +1043,58 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
if not formats_to_download_initial:
raise AirflowException("No valid download format selectors were found after parsing.")
- # --- Filter requested formats against available formats ---
+ # --- Filter and resolve requested formats ---
final_formats_to_download = []
if not available_formats:
- logger.warning("List of available formats is empty. Will attempt to download all requested formats without validation.")
- final_formats_to_download = formats_to_download_initial
- else:
- for selector in formats_to_download_initial:
- # A selector can be '140' or '299/298/137' or '140-dashy'
+ logger.warning("List of available formats is empty. Cannot validate numeric selectors, but will attempt to resolve generic selectors.")
+ for selector in formats_to_download_initial:
+ # A selector is considered generic if it contains keywords like 'best' or filter brackets '[]'.
+ is_generic = bool(re.search(r'(best|\[|\])', selector))
if is_generic:
resolved_selector = _resolve_generic_selector(selector, info_json_path, logger)
if resolved_selector:
# The resolver returns a list for '+' selectors, or a string for others.
resolved_formats = resolved_selector if isinstance(resolved_selector, list) else [resolved_selector]
for res_format in resolved_formats:
# Prefer -dashy version if available and the format is a simple numeric ID
if res_format.isdigit() and f"{res_format}-dashy" in available_formats:
final_format = f"{res_format}-dashy"
logger.info(f"Resolved format '{res_format}' from selector '{selector}'. Preferred '-dashy' version: '{final_format}'.")
else:
final_format = res_format
# Validate the chosen format against available formats
if available_formats:
individual_ids = re.split(r'[/+]', final_format)
is_available = any(fid in available_formats for fid in individual_ids)
if is_available:
final_formats_to_download.append(final_format)
else:
logger.warning(f"Resolved format '{final_format}' (from '{selector}') contains no available formats. Skipping.")
else:
# Cannot validate, so we trust the resolver's output.
final_formats_to_download.append(final_format)
else:
logger.warning(f"Could not resolve generic selector '{selector}' using yt-dlp. Skipping.")
else:
# This is a numeric-based selector (e.g., '140' or '299/298' or '140-dashy').
# Validate it against the available formats.
if not available_formats:
logger.warning(f"Cannot validate numeric selector '{selector}' because available formats list is empty. Assuming it's valid.")
final_formats_to_download.append(selector)
continue
individual_ids = re.split(r'[/+]', selector)
- is_available = any(fid in available_formats for fid in individual_ids)
# Extract the numeric part of the format ID for checking against available_formats
is_available = False
for fid in individual_ids:
numeric_id_match = re.match(r'^\d+', fid)
if numeric_id_match:
numeric_id = numeric_id_match.group(0)
if numeric_id in available_formats:
is_available = True
break # Found a match, no need to check other parts of the selector
if is_available:
final_formats_to_download.append(selector)
else:
- logger.warning(f"Requested format selector '{selector}' contains no available formats. Skipping.")
+ logger.warning(f"Requested numeric format selector '{selector}' contains no available formats. Skipping.")
if not final_formats_to_download:
raise AirflowException("None of the requested formats are available for this video.")
@ -909,6 +1116,11 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
logger.warning(f"Probe failed for {len(failed_files)} file(s). Attempting one re-download for failed files...")
+ delay_between_formats = params.get('delay_between_formats_s', 0)
+ if delay_between_formats > 0:
+ logger.info(f"Waiting {delay_between_formats}s before re-download attempt...")
+ time.sleep(delay_between_formats)
format_ids_to_retry = []
# Since each download is now for a specific selector and the output template
# includes the format_id, we can always attempt to extract the format_id
@ -945,7 +1157,7 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
if not final_success_list:
raise AirflowException("Download and probe process completed but produced no valid files.")
- if params.get('yt_dlp_cleanup_mode', True):
+ if params.get('yt_dlp_cleanup_mode', False):
logger.info(f"Cleanup mode is enabled. Creating .empty files and deleting originals for {len(final_success_list)} files.")
for f in final_success_list:
try:
@ -965,6 +1177,26 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
if not video_id:
logger.error(f"Could not extract video_id from URL '{original_url}' for final move. Skipping.")
else:
# --- Rename info.json to a simple format before moving ---
path_to_info_json_for_move = info_json_path # Default to original path
try:
# info_json_path is the full path to the original info.json
if info_json_path and os.path.exists(info_json_path):
new_info_json_name = f"info_{video_id}.json"
new_info_json_path = os.path.join(os.path.dirname(info_json_path), new_info_json_name)
if info_json_path != new_info_json_path:
logger.info(f"Renaming '{info_json_path}' to '{new_info_json_path}' for final delivery.")
os.rename(info_json_path, new_info_json_path)
path_to_info_json_for_move = new_info_json_path
else:
logger.info("info.json already has the simple name. No rename needed.")
else:
logger.warning("Could not find info.json to rename before moving.")
except Exception as rename_e:
logger.error(f"Failed to rename info.json before move: {rename_e}", exc_info=True)
# --- End of rename logic ---
source_dir = download_dir # This is the job_dir_path
# Group downloads into 10-minute batch folders based on completion time.
@ -982,18 +1214,65 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
logger.warning(f"Destination '{final_dir_path}' already exists. It will be removed and replaced.")
shutil.rmtree(final_dir_path)
- os.rename(source_dir, final_dir_path)
- logger.info(f"Successfully moved job to '{final_dir_path}'.")
+ # Create the destination directory and move only the essential files, then clean up the source.
+ # This ensures no temporary or junk files are carried over.
os.makedirs(final_dir_path)
# 1. Move the info.json file
if path_to_info_json_for_move and os.path.exists(path_to_info_json_for_move):
shutil.move(path_to_info_json_for_move, final_dir_path)
logger.info(f"Moved '{os.path.basename(path_to_info_json_for_move)}' to destination.")
# 2. Move the media files (or their .empty placeholders)
files_to_move = []
if params.get('yt_dlp_cleanup_mode', False):
files_to_move = [f"{f}.empty" for f in final_success_list]
else:
files_to_move = final_success_list
for f in files_to_move:
if os.path.exists(f):
shutil.move(f, final_dir_path)
logger.info(f"Moved '{os.path.basename(f)}' to destination.")
else:
logger.warning(f"File '{f}' expected but not found for moving.")
# 3. Clean up the original source directory
logger.info(f"Cleaning up original source directory '{source_dir}'")
shutil.rmtree(source_dir)
logger.info(f"Successfully moved job to '{final_dir_path}' and cleaned up source.")
except Exception as e:
logger.error(f"Failed to move completed job directory: {e}", exc_info=True)
# Do not fail the task for a move error, just log it.
return final_success_list
except Exception as e:
- if 'HTTP Error 403: Forbidden' in str(e):
- logger.warning("Detected 'HTTP Error 403: Forbidden' in download error. Pushing details to XCom for branching.")
- ti = context['task_instance']
- ti.xcom_push(key='download_error_details', value={'error_code': 'HTTP_403_FORBIDDEN', 'error_message': str(e)})
+ ti = context['task_instance']
+ error_message = str(e)
+ error_code = "DOWNLOAD_FAILED"
+ msg_lower = error_message.lower()
unrecoverable_patterns = {
"AGE_GATED_SIGN_IN": ['sign in to confirm your age'],
"MEMBERS_ONLY": ['members-only content'],
"VIDEO_PROCESSING": ['processing this video'],
"COPYRIGHT_REMOVAL": ['copyright'],
"GEO_RESTRICTED": ['in your country'],
"PRIVATE_VIDEO": ['private video'],
"VIDEO_REMOVED": ['video has been removed'],
"VIDEO_UNAVAILABLE": ['video unavailable'],
"HTTP_403_FORBIDDEN": ['http error 403: forbidden']
}
for code, patterns in unrecoverable_patterns.items():
if any(p in msg_lower for p in patterns):
error_code = code
break
# Always push details to XCom for the branch operator to inspect.
error_details = {'error_code': error_code, 'error_message': error_message}
ti.xcom_push(key='download_error_details', value=error_details)
raise AirflowException(f"Download and probe failed: {e}") from e raise AirflowException(f"Download and probe failed: {e}") from e
@task
@ -1256,6 +1535,12 @@ def continue_processing_loop(**context):
logger.info(f"DAG run '{dag_run.run_id}' does not appear to be triggered by the dispatcher. Stopping processing loop.")
return
dispatcher_dag_id = 'ytdlp_ops_v01_dispatcher'
dag_model = DagModel.get_dagmodel(dispatcher_dag_id)
if dag_model and dag_model.is_paused:
logger.warning(f"Dispatcher DAG '{dispatcher_dag_id}' is paused. Stopping processing loop.")
return
# Create a new unique run_id for the dispatcher.
# Using a timestamp and UUID ensures the ID is unique and does not grow in length over time,
# preventing database errors.
@ -1270,7 +1555,7 @@ def continue_processing_loop(**context):
logger.info(f"Worker finished successfully. Triggering a new dispatcher ('{new_dispatcher_run_id}') to continue the loop.")
trigger_dag(
- dag_id='ytdlp_ops_v01_dispatcher',
+ dag_id=dispatcher_dag_id,
run_id=new_dispatcher_run_id,
conf=conf_to_pass,
replace_microseconds=False
@ -1292,10 +1577,15 @@ def handle_retry_failure_branch(task_id_to_check: str, **context):
error_message = error_details.get('error_message', '').strip()
error_code = error_details.get('error_code', '').strip()
- # Check if this is an age confirmation error - should not stop the loop
- if "Sign in to confirm your age" in error_message or "confirm your age" in error_message.lower():
- logger.info(f"Age confirmation error detected on retry from '{task_id_to_check}'. This is a content restriction, not a bot detection issue.")
- return 'handle_age_restriction_error'
+ # Unrecoverable video errors that should not be retried or treated as system failures.
+ unrecoverable_video_errors = [
+ "AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "COPYRIGHT_REMOVAL",
+ "GEO_RESTRICTED", "VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED"
]
if error_code in unrecoverable_video_errors:
logger.warning(f"Unrecoverable video error '{error_code}' detected on retry for '{task_id_to_check}'.")
return 'handle_unrecoverable_video_error'
if error_code == 'TRANSPORT_ERROR':
logger.error(f"Fatal Thrift connection error on retry from '{task_id_to_check}'.")
@ -1337,6 +1627,17 @@ def handle_download_failure_branch(**context):
# The full task_id for download_and_probe is 'download_processing.download_and_probe'
download_error_details = ti.xcom_pull(task_ids='download_processing.download_and_probe', key='download_error_details')
if download_error_details:
error_code = download_error_details.get('error_code')
unrecoverable_video_errors = [
"AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "COPYRIGHT_REMOVAL",
"GEO_RESTRICTED", "VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED",
"HTTP_403_FORBIDDEN"
]
if error_code in unrecoverable_video_errors:
logger.warning(f"Unrecoverable video error '{error_code}' during download. Skipping.")
return 'handle_unrecoverable_video_error'
if policy == 'retry_with_new_token':
logger.info("Download failed. Policy is to retry with a new token. Branching to retry logic.")
return 'retry_logic_for_download'
@ -1366,6 +1667,58 @@ def coalesce_token_data(get_token_result=None, retry_get_token_result=None):
raise AirflowException("Could not find a successful token result from any attempt.")
@task
def handle_unrecoverable_video_error(**context):
"""
Handles errors for videos that are unavailable (private, removed, etc.).
These are not system failures, so the URL is logged to a 'skipped' queue
and the processing loop continues without marking the run as failed.
"""
params = context['params']
ti = context['task_instance']
url = params.get('url_to_process', 'unknown')
# Collect error details from the failed task
error_details = {}
auth_error = ti.xcom_pull(task_ids='initial_attempt.get_token', key='error_details')
auth_retry_error = ti.xcom_pull(task_ids='retry_logic.retry_get_token', key='error_details')
download_error = ti.xcom_pull(task_ids='download_processing.download_and_probe', key='download_error_details')
if auth_retry_error: error_details = auth_retry_error
elif auth_error: error_details = auth_error
elif download_error: error_details = download_error
error_code = error_details.get('error_code', 'UNKNOWN_VIDEO_ERROR')
error_message = error_details.get('error_message', 'Video is unavailable for an unknown reason.')
logger.warning(f"Skipping URL '{url}' due to unrecoverable video error: {error_code} - {error_message}")
result_data = {
'status': 'skipped',
'end_time': time.time(),
'url': url,
'dag_run_id': context['dag_run'].run_id,
'reason': error_code,
'details': error_message,
'error_details': error_details
}
try:
client = _get_redis_client(params['redis_conn_id'])
skipped_queue = f"{params['queue_name']}_skipped"
progress_queue = f"{params['queue_name']}_progress"
with client.pipeline() as pipe:
pipe.hset(skipped_queue, url, json.dumps(result_data))
pipe.hdel(progress_queue, url)
pipe.execute()
logger.info(f"Stored skipped result for URL '{url}' in '{skipped_queue}' and removed from progress queue.")
except Exception as e:
logger.error(f"Could not report skipped video to Redis: {e}", exc_info=True)
@task
def report_bannable_and_continue(**context):
"""
@ -1428,71 +1781,6 @@ def report_bannable_and_continue(**context):
logger.error(f"Could not report bannable error to Redis: {e}", exc_info=True)
@task
def handle_age_restriction_error(**context):
"""
Handles age restriction errors specifically. These are content restrictions
that cannot be bypassed by using different accounts, so we report the failure
and continue the processing loop rather than stopping it.
"""
params = context['params']
ti = context['task_instance']
url = params.get('url_to_process', 'unknown')
# Collect error details
error_details = {}
first_token_task_id = 'get_token'
retry_token_task_id = 'retry_get_token'
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
# Use the most recent error details
if retry_token_error:
error_details = retry_token_error
elif first_token_error:
error_details = first_token_error
logger.error(f"Age restriction error for URL '{url}'. This content requires age confirmation and cannot be bypassed.")
# Report failure to Redis so the URL can be marked as failed
try:
client = _get_redis_client(params['redis_conn_id'])
# Update client-specific stats
try:
machine_id = params.get('machine_id') or socket.gethostname()
_update_client_stats(client, params.get('clients', ''), 'failure', url, machine_id, context['dag_run'].run_id)
except Exception as e:
logger.error(f"Could not update client stats on age restriction error: {e}", exc_info=True)
result_data = {
'status': 'failed',
'end_time': time.time(),
'url': url,
'dag_run_id': context['dag_run'].run_id,
'error': 'age_restriction',
'error_message': 'Content requires age confirmation',
'error_details': error_details
}
result_queue = f"{params['queue_name']}_result"
fail_queue = f"{params['queue_name']}_fail"
progress_queue = f"{params['queue_name']}_progress"
with client.pipeline() as pipe:
pipe.hset(result_queue, url, json.dumps(result_data))
pipe.hset(fail_queue, url, json.dumps(result_data))
pipe.hdel(progress_queue, url)
pipe.execute()
logger.info(f"Stored age restriction error for URL '{url}' in '{result_queue}' and '{fail_queue}'.")
except Exception as e:
logger.error(f"Could not report age restriction error to Redis: {e}", exc_info=True)
# This is NOT a fatal error for the processing loop - we just continue with the next URL
# =============================================================================
# DAG Definition with TaskGroups
# =============================================================================
@ -1533,28 +1821,23 @@ with DAG(
description="Policy for handling download or probe failures."
),
'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."),
+ 'language_code': Param('en-US', type="string", title="[Worker Param] Language Code", description="The language code (e.g., 'en-US', 'de-DE') to use for the YouTube request headers."),
'retry_on_probe_failure': Param(False, type="boolean"),
'skip_probe': Param(False, type="boolean", title="[Worker Param] Skip Probe", description="If True, skips the ffmpeg probe of downloaded files."),
- 'yt_dlp_cleanup_mode': Param(True, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
+ 'yt_dlp_cleanup_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
'auto_create_new_accounts_on_exhaustion': Param(True, type="boolean"),
- 'fragment_retries': Param(2, type="integer", title="[Worker Param] Fragment Retries", description="Number of retries for a fragment before giving up. Default is 2 to fail fast on expired tokens."),
'delay_between_formats_s': Param(15, type="integer", title="[Worker Param] Delay Between Formats (s)", description="Delay in seconds between downloading each format when multiple formats are specified. A 22s wait may be effective for batch downloads, while 6-12s may suffice if cookies are refreshed regularly."),
'yt_dlp_test_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Test Mode", description="If True, runs yt-dlp with --test flag (dry run without downloading)."),
'socket_timeout': Param(15, type="integer", title="[Worker Param] Socket Timeout", description="Timeout in seconds for socket operations."),
- 'download_format_preset': Param(
- 'format_1',
+ 'download_format': Param(
+ 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
type="string",
- enum=['format_1', 'format_2', 'custom'],
- title="Download Format Preset",
- description="Select a predefined format string or choose 'custom'.\nformat_1: 18-dashy/18,140-dashy/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformat_2: (299/298/137/136/135/134/133)-dashy"
- ),
- 'download_format_custom': Param(
- '18-dashy/18,140-dashy/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy',
- type="string",
- title="Custom Download Format",
- description="Custom yt-dlp format string. Used when preset is 'custom'. To download multiple formats, provide a comma-separated list of format IDs (e.g., '137,140')."
+ title="[Worker Param] Download Format",
+ description="Custom yt-dlp format string. Common presets: [1] 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best' (Default, best quality MP4). [2] '18-dashy/18,140-dashy/140,133-dashy/134-dashy/136-dashy/137-dashy/250-dashy/298-dashy/299-dashy' (Legacy formats). [3] '299-dashy/298-dashy/250-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy' (High-framerate formats)."
),
'downloader': Param(
- 'py',
+ 'cli',
type="string",
enum=['py', 'aria-rpc', 'cli'],
title="Download Tool",
@ -1564,12 +1847,12 @@ with DAG(
'aria_port': Param(6800, type="integer", title="Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server."),
'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="Aria2c Secret", description="For 'aria-rpc' downloader: Secret token."),
'yt_dlp_extra_args': Param(
- '--no-resize-buffer --buffer-size 4M --min-sleep-interval 5 --max-sleep-interval 10',
+ '',
type=["string", "null"],
title="Extra yt-dlp arguments",
),
# --- Manual Run / Internal Parameters ---
- 'manual_url_to_process': Param('iPwdia3gAnk', type=["string", "null"], title="[Manual Run] URL to Process", description="For manual runs, provide a single YouTube URL to process. This is ignored if triggered by the dispatcher."),
+ 'manual_url_to_process': Param('iPwdia3gAnk', type=["string", "null"], title="[Manual Run] URL to Process", description="For manual runs, provide a single YouTube URL, or the special value 'PULL_FROM_QUEUE' to pull one URL from the Redis inbox. This is ignored if triggered by the dispatcher."),
'url_to_process': Param(None, type=["string", "null"], title="[Internal] URL from Dispatcher", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
'worker_queue': Param(None, type=["string", "null"], title="[Internal] Worker Queue", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
}
@ -1583,7 +1866,7 @@
report_failure_and_stop_task = report_failure_and_stop()
report_failure_task = report_failure_and_continue()
continue_loop_task = continue_processing_loop()
- age_restriction_task = handle_age_restriction_error()
+ unrecoverable_video_error_task = handle_unrecoverable_video_error()
report_bannable_and_continue_task = report_bannable_and_continue()
# --- Task Group 1: Initial Attempt ---
@ -1600,7 +1883,7 @@
)
first_token_attempt >> initial_branch_task
- initial_branch_task >> [fatal_error_task, ban_and_report_immediately_task, age_restriction_task, report_bannable_and_continue_task]
+ initial_branch_task >> [fatal_error_task, ban_and_report_immediately_task, unrecoverable_video_error_task, report_bannable_and_continue_task]
# --- Task Group 2: Retry Logic ---
with TaskGroup("retry_logic", tooltip="Retry logic with account management") as retry_logic_group:
@ -1650,7 +1933,7 @@
direct_retry_account_task >> coalesced_retry_data
coalesced_retry_data >> retry_token_task
retry_token_task >> retry_branch_task
- retry_branch_task >> [fatal_error_task, report_failure_task, ban_after_retry_report_task, age_restriction_task, report_bannable_and_continue_task]
+ retry_branch_task >> [fatal_error_task, report_failure_task, ban_after_retry_report_task, unrecoverable_video_error_task, report_bannable_and_continue_task]
ban_after_retry_report_task >> report_failure_and_stop_task
# --- Task Group 3: Download and Processing ---
@ -1759,18 +2042,18 @@
# --- DAG Dependencies between TaskGroups ---
# Initial attempt can lead to retry logic or direct failure
- initial_branch_task >> [retry_logic_group, fatal_error_task, ban_and_report_immediately_task, age_restriction_task, report_bannable_and_continue_task]
+ initial_branch_task >> [retry_logic_group, fatal_error_task, ban_and_report_immediately_task, unrecoverable_video_error_task, report_bannable_and_continue_task]
# Ban and report immediately leads to failure reporting
ban_and_report_immediately_task >> report_failure_and_stop_task
- # Age restriction error leads to failure reporting and continues the loop
- age_restriction_task >> continue_loop_task
+ # Unrecoverable/bannable errors that don't stop the loop should continue processing
+ unrecoverable_video_error_task >> continue_loop_task
report_bannable_and_continue_task >> continue_loop_task
report_failure_task >> continue_loop_task
# Connect download failure branch to the new retry group
- download_branch_task >> [retry_logic_for_download_group, report_failure_task, fatal_error_task]
+ download_branch_task >> [retry_logic_for_download_group, report_failure_task, fatal_error_task, unrecoverable_video_error_task]
# Connect success paths to the coalescing tasks
download_task >> final_files
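As a worked illustration of how the new download_format default interacts with the resolution logic in download_and_probe above (the resolved IDs and the available_formats list are assumptions for the example):
import re

selector = 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best'
assert re.search(r'(best|\[|\])', selector)                 # generic -> resolved via yt-dlp first
resolved = ['137', '140']                                   # assumed output of _resolve_generic_selector
available_formats = ['18', '137', '137-dashy', '140']
final = [f"{fid}-dashy" if f"{fid}-dashy" in available_formats else fid for fid in resolved]
# final == ['137-dashy', '140']; each entry becomes its own download attempt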
View File
@ -18,7 +18,7 @@ from airflow.utils.dates import days_ago
from airflow.api.common.trigger_dag import trigger_dag
from airflow.models.dagrun import DagRun
from airflow.models.dag import DagModel
- from datetime import timedelta
+ from datetime import timedelta, datetime
import logging
import random
import time
@ -35,41 +35,6 @@ from thrift.transport import TSocket, TTransport
# Configure logging
logger = logging.getLogger(__name__)
DEFAULT_REQUEST_PARAMS_JSON = """{
"context_reuse_policy": {
"enabled": true,
"max_age_seconds": 86400,
"reuse_visitor_id": true,
"reuse_cookies": true
},
"token_generation_strategy": {
"youtubei_js": {
"generate_po_token": true,
"generate_gvs_token": true
}
},
"ytdlp_params": {
"use_curl_prefetch": false,
"token_supplement_strategy": {
"youtubepot_bgutilhttp_extractor": {
"enabled": true
}
},
"visitor_id_override": {
"enabled": true
}
},
"session_params": {
"lang": "en-US",
"location": "US",
"deviceCategory": "MOBILE",
"user_agents": {
"youtubei_js": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)",
"yt_dlp": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)"
}
}
}"""
# Default settings
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_TOTAL_WORKERS = 8
@ -188,6 +153,17 @@ def orchestrate_workers_ignition_callable(**context):
dag_run_id = context['dag_run'].run_id
total_triggered = 0
# --- Generate a consistent timestamped prefix for this orchestrator run ---
# This ensures all workers spawned from this run use the same set of accounts.
final_account_pool_prefix = params['account_pool']
if params.get('prepend_client_to_account') and params.get('account_pool_size') is not None:
clients_str = params.get('clients', '')
primary_client = clients_str.split(',')[0].strip() if clients_str else 'unknown'
# Use a timestamp from the orchestrator's run for consistency
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
final_account_pool_prefix = f"{params['account_pool']}_{timestamp}_{primary_client}"
logger.info(f"Generated consistent account prefix for this run: '{final_account_pool_prefix}'")
for i, bunch in enumerate(bunches):
logger.info(f"--- Triggering Bunch {i+1}/{len(bunches)} (contains {len(bunch)} dispatcher(s)) ---")
for j, _ in enumerate(bunch):
@ -196,6 +172,8 @@ def orchestrate_workers_ignition_callable(**context):
# Pass all orchestrator params to the dispatcher, which will then pass them to the worker.
conf_to_pass = {p: params[p] for p in params}
# Override account_pool with the generated prefix
conf_to_pass['account_pool'] = final_account_pool_prefix
logger.info(f"Triggering dispatcher {j+1}/{len(bunch)} in bunch {i+1} (run {total_triggered + 1}/{total_workers}) (Run ID: {run_id})") logger.info(f"Triggering dispatcher {j+1}/{len(bunch)} in bunch {i+1} (run {total_triggered + 1}/{total_workers}) (Run ID: {run_id})")
logger.debug(f"Full conf for dispatcher run {run_id}: {conf_to_pass}") logger.debug(f"Full conf for dispatcher run {run_id}: {conf_to_pass}")
@ -294,17 +272,12 @@ with DAG(
"'proceed_loop_under_manual_inspection': **BEWARE: MANUAL SUPERVISION REQUIRED.** Marks the URL as failed but continues the processing loop. Use this only when you can manually intervene by pausing the dispatcher DAG or creating a lock file (`/opt/airflow/inputfiles/AIRFLOW.PREVENT_URL_PULL.lockfile`) to prevent a runaway failure loop." "'proceed_loop_under_manual_inspection': **BEWARE: MANUAL SUPERVISION REQUIRED.** Marks the URL as failed but continues the processing loop. Use this only when you can manually intervene by pausing the dispatcher DAG or creating a lock file (`/opt/airflow/inputfiles/AIRFLOW.PREVENT_URL_PULL.lockfile`) to prevent a runaway failure loop."
"'stop_loop_on_auth_proceed_on_download_error': **(Default)** Stops the loop on an authentication/token error (like 'stop_loop'), but continues the loop on a download/probe error (like 'proceed...')." "'stop_loop_on_auth_proceed_on_download_error': **(Default)** Stops the loop on an authentication/token error (like 'stop_loop'), but continues the loop on a download/probe error (like 'proceed...')."
), ),
'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."), 'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with per-request parameters to override server defaults. Can be a full JSON object or comma-separated key=value pairs (e.g., 'session_params.location=DE,ytdlp_params.skip_cache=true')."),
'language_code': Param('en-US', type="string", title="[Worker Param] Language Code", description="The language code (e.g., 'en-US', 'de-DE') to use for the YouTube request headers."),
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."), 'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."),
'clients': Param( 'clients': Param(
'tv_simply', 'tv_simply',
type="string", type="string",
enum=[
'tv_simply',
'mweb',
'tv',
'custom',
],
title="[Worker Param] Clients", title="[Worker Param] Clients",
description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_embedded. See DAG documentation for details." description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_embedded. See DAG documentation for details."
), ),
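The new request_params_json description above also allows a comma-separated key=value shorthand; the parser for it is not shown in this diff, so the following is only a hypothetical sketch of how such dotted overrides could be expanded into a nested dict (function name and behaviour are assumptions):
def expand_overrides(spec: str) -> dict:
    """Hypothetical helper: turn 'a.b=1,c.d=true' style pairs into a nested dict."""
    result: dict = {}
    for pair in filter(None, (p.strip() for p in spec.split(','))):
        dotted, _, raw = pair.partition('=')
        value = {'true': True, 'false': False}.get(raw.strip().lower(), raw.strip())
        node = result
        *parents, leaf = dotted.strip().split('.')
        for key in parents:
            node = node.setdefault(key, {})
        node[leaf] = value
    return result

# expand_overrides('session_params.location=DE,ytdlp_params.skip_cache=true')
# -> {'session_params': {'location': 'DE'}, 'ytdlp_params': {'skip_cache': True}}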
View File
@ -249,27 +249,20 @@ with DAG(
'delay_between_formats_s': Param(15, type="integer", title="[Worker Param] Delay Between Formats (s)", description="Delay in seconds between downloading each format when multiple formats are specified. A 22s wait may be effective for batch downloads, while 6-12s may suffice if cookies are refreshed regularly."),
'yt_dlp_test_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Test Mode", description="If True, runs yt-dlp with --test flag (dry run without downloading)."),
'skip_probe': Param(True, type="boolean", title="[Worker Param] Skip Probe", description="If True, skips the ffmpeg probe of downloaded files."),
- 'yt_dlp_cleanup_mode': Param(True, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
+ 'yt_dlp_cleanup_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
'fragment_retries': Param(2, type="integer", title="[Worker Param] Fragment Retries", description="Number of retries for a fragment before giving up."),
'limit_rate': Param('5M', type=["string", "null"], title="[Worker Param] Limit Rate", description="Download speed limit (e.g., 50K, 4.2M)."),
'socket_timeout': Param(15, type="integer", title="[Worker Param] Socket Timeout", description="Timeout in seconds for socket operations."),
'min_sleep_interval': Param(5, type="integer", title="[Worker Param] Min Sleep Interval", description="Minimum time to sleep between downloads (seconds)."),
'max_sleep_interval': Param(10, type="integer", title="[Worker Param] Max Sleep Interval", description="Maximum time to sleep between downloads (seconds)."),
- 'download_format_preset': Param(
- 'formats_2',
+ 'download_format': Param(
+ 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
type="string",
- enum=['best_audio', 'formats_0', 'formats_2', 'formats_3', 'custom'],
- title="[Worker Param] Download Format Preset",
- description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformats_0: 18,140\nformats_2: 18,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318"
- ),
- 'download_format_custom': Param(
- '18,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy',
- type="string",
- title="[Worker Param] Custom Download Format",
- description="Custom yt-dlp format string. Used when preset is 'custom'. E.g., 'ba[ext=m4a]/bestaudio/best'."
+ title="[Worker Param] Download Format",
+ description="Custom yt-dlp format string. Common presets: [1] 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best' (Default, best quality MP4). [2] '18-dashy/18,140-dashy/140,133-dashy/134-dashy/136-dashy/137-dashy/250-dashy/298-dashy/299-dashy' (Legacy formats). [3] '299-dashy/298-dashy/250-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy' (High-framerate formats)."
),
'downloader': Param(
- 'py',
+ 'cli',
type="string",
enum=['py', 'aria-rpc', 'cli'],
title="[Worker Param] Download Tool",
@ -279,7 +272,7 @@ with DAG(
'aria_port': Param(6800, type="integer", title="[Worker Param] Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server. Can be set via Airflow Variable 'YTDLP_ARIA_PORT'."),
'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="[Worker Param] Aria2c Secret", description="For 'aria-rpc' downloader: Secret token. Can be set via Airflow Variable 'YTDLP_ARIA_SECRET'."),
'yt_dlp_extra_args': Param(
- '--restrict-filenames',
+ '--no-part --restrict-filenames',
type=["string", "null"],
title="[Worker Param] Extra yt-dlp arguments",
description="Extra command-line arguments for yt-dlp during download."
View File
@ -17,14 +17,14 @@ from __future__ import annotations
from airflow.decorators import task, task_group
from airflow.exceptions import AirflowException, AirflowSkipException
from airflow.models import Variable
- from airflow.models.dag import DAG
+ from airflow.models.dag import DAG, DagModel
from airflow.models.param import Param
from airflow.models.xcom_arg import XComArg
from airflow.operators.dummy import DummyOperator
from airflow.utils.dates import days_ago
from airflow.utils.task_group import TaskGroup
from airflow.api.common.trigger_dag import trigger_dag
- from copy import copy
+ import copy
from datetime import datetime, timedelta
import concurrent.futures
import json
@ -143,10 +143,12 @@ DEFAULT_REQUEST_PARAMS = {
"session_params": {
"lang": "en-US",
"location": "US",
- "deviceCategory": "MOBILE",
+ "deviceCategory": "TV",
"user_agents": {
- "youtubei_js": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)",
- "yt_dlp": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)"
+ # "youtubei_js": "Mozilla/5.0 (Linux; Cobalt) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
+ "youtubei_js": "Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version",
+ # "yt_dlp": "Mozilla/5.0 (Linux; Cobalt) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
+ "yt_dlp": "Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version"
}
}
}
@ -208,14 +210,9 @@ def _get_account_pool(params: dict) -> list:
is_prefix_mode = True
pool_size = int(pool_size_param)
- if params.get('prepend_client_to_account', True):
- clients_str = params.get('clients', '')
- primary_client = clients_str.split(',')[0].strip() if clients_str else 'unknown'
- timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
- new_prefix = f"{prefix}_{timestamp}_{primary_client}"
- accounts = [f"{new_prefix}_{i:02d}" for i in range(1, pool_size + 1)]
- else:
- accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
+ # The orchestrator now generates the full prefix if prepend_client_to_account is True.
+ # The worker just appends the numbers.
+ accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
else:
accounts = [prefix]
@ -347,12 +344,26 @@ def get_url_and_assign_account(**context):
# For manual runs, we fall back to 'manual_url_to_process'. # For manual runs, we fall back to 'manual_url_to_process'.
url_to_process = params.get('url_to_process') url_to_process = params.get('url_to_process')
if not url_to_process: if not url_to_process:
url_to_process = params.get('manual_url_to_process') manual_url_input = params.get('manual_url_to_process')
if url_to_process: if manual_url_input:
logger.info(f"Using URL from manual run parameter: '{url_to_process}'") logger.info(f"Using URL from manual run parameter: '{manual_url_input}'")
if manual_url_input == 'PULL_FROM_QUEUE':
logger.info("Manual run is set to pull from queue.")
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
queue_name = params.get('queue_name', DEFAULT_QUEUE_NAME)
inbox_queue = f"{queue_name}_inbox"
client = _get_redis_client(redis_conn_id)
url_bytes = client.lpop(inbox_queue)
if not url_bytes:
logger.info("Redis queue is empty. No work to do. Skipping task.")
raise AirflowSkipException("Redis queue is empty. No work to do.")
url_to_process = url_bytes.decode('utf-8')
logger.info(f"Pulled URL '{url_to_process}' from queue '{inbox_queue}'.")
else:
url_to_process = manual_url_input
if not url_to_process: if not url_to_process:
raise AirflowException("No URL to process. For manual runs, please provide a URL in the 'manual_url_to_process' parameter.") raise AirflowException("No URL to process. For manual runs, please provide a URL in the 'manual_url_to_process' parameter, or 'PULL_FROM_QUEUE'.")
logger.info(f"Received URL '{url_to_process}' to process.") logger.info(f"Received URL '{url_to_process}' to process.")
# Mark the URL as in-progress in Redis # Mark the URL as in-progress in Redis
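The 'PULL_FROM_QUEUE' branch above pops one URL from the `<queue_name>_inbox` Redis list. A minimal sketch of seeding that list for a manual test run, assuming a local Redis and an illustrative queue name (the DAG resolves both from its parameters and `redis_conn_id`):

```python
import redis

# Hypothetical connection; the DAG resolves this from the 'redis_conn_id' Airflow connection.
client = redis.Redis(host="localhost", port=6379, db=0)

queue_name = "ytdlp_queue"          # illustrative; the real default comes from DEFAULT_QUEUE_NAME
inbox_queue = f"{queue_name}_inbox"

# rpush here pairs with the worker's lpop, giving FIFO ordering.
client.rpush(inbox_queue, "https://www.youtube.com/watch?v=iPwdia3gAnk")
print(f"{client.llen(inbox_queue)} URL(s) waiting in {inbox_queue}")
```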
@ -399,9 +410,29 @@ def get_token(initial_data: dict, **context):
host, port = params['service_ip'], int(params['service_port']) host, port = params['service_ip'], int(params['service_port'])
machine_id = params.get('machine_id') or socket.gethostname() machine_id = params.get('machine_id') or socket.gethostname()
clients = params.get('clients') clients = params.get('clients')
request_params_json = params.get('request_params_json', '{}') request_params_json = params.get('request_params_json')
language_code = params.get('language_code')
assigned_proxy_url = params.get('assigned_proxy_url') assigned_proxy_url = params.get('assigned_proxy_url')
if language_code:
try:
params_dict = json.loads(request_params_json)
if not params_dict:
params_dict = copy.deepcopy(DEFAULT_REQUEST_PARAMS)
logger.info(f"Setting language for request: {language_code}")
if 'session_params' not in params_dict:
params_dict['session_params'] = {}
params_dict['session_params']['lang'] = language_code
request_params_json = json.dumps(params_dict)
except (json.JSONDecodeError, TypeError):
logger.warning("Could not parse request_params_json as JSON. Treating as key=value pairs and appending language code.")
lang_kv = f"session_params.lang={language_code}"
if request_params_json:
request_params_json += f",{lang_kv}"
else:
request_params_json = lang_kv
video_id = _extract_video_id(url) video_id = _extract_video_id(url)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
job_dir_name = f"{timestamp}-{video_id or 'unknown'}" job_dir_name = f"{timestamp}-{video_id or 'unknown'}"
@ -445,18 +476,39 @@ def get_token(initial_data: dict, **context):
if process.returncode != 0: if process.returncode != 0:
error_message = "ytops-client failed. See logs for details." error_message = "ytops-client failed. See logs for details."
for line in reversed(process.stderr.strip().split('\n')): # Try to find a more specific error message from the Thrift client's output
if 'ERROR' in line or 'Thrift error' in line or 'Connection to server failed' in line: thrift_error_match = re.search(r'A Thrift error occurred: (.*)', process.stderr)
error_message = line.strip() if thrift_error_match:
break error_message = thrift_error_match.group(1).strip()
else: # Fallback to old line-by-line parsing
for line in reversed(process.stderr.strip().split('\n')):
if 'ERROR' in line or 'Thrift error' in line or 'Connection to server failed' in line:
error_message = line.strip()
break
# Determine error code for branching logic
error_code = 'GET_INFO_CLIENT_FAIL' error_code = 'GET_INFO_CLIENT_FAIL'
if "BOT_DETECTED" in process.stderr: stderr_lower = process.stderr.lower()
error_code = "BOT_DETECTED"
elif "BOT_DETECTION_SIGN_IN_REQUIRED" in process.stderr: # These patterns should match the error codes from PBUserException and others
error_code = "BOT_DETECTION_SIGN_IN_REQUIRED" error_patterns = {
elif "Connection to server failed" in process.stderr: "BOT_DETECTED": ["bot_detected"],
error_code = "TRANSPORT_ERROR" "BOT_DETECTION_SIGN_IN_REQUIRED": ["bot_detection_sign_in_required"],
"TRANSPORT_ERROR": ["connection to server failed"],
"PRIVATE_VIDEO": ["private video"],
"COPYRIGHT_REMOVAL": ["copyright"],
"GEO_RESTRICTED": ["in your country"],
"VIDEO_REMOVED": ["video has been removed"],
"VIDEO_UNAVAILABLE": ["video unavailable"],
"MEMBERS_ONLY": ["members-only"],
"AGE_GATED_SIGN_IN": ["sign in to confirm your age"],
"VIDEO_PROCESSING": ["processing this video"],
}
for code, patterns in error_patterns.items():
if any(p in stderr_lower for p in patterns):
error_code = code
break # Found a match, stop searching
error_details = { error_details = {
'error_message': error_message, 'error_message': error_message,
@ -471,8 +523,23 @@ def get_token(initial_data: dict, **context):
if proxy_match: if proxy_match:
proxy = proxy_match.group(1).strip() proxy = proxy_match.group(1).strip()
# Rename the info.json to include the proxy for the download worker
final_info_json_path = info_json_path
if proxy:
# Sanitize for filename: replace '://' which is invalid in paths. Colons are usually fine.
sanitized_proxy = proxy.replace('://', '---')
new_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}_proxy_{sanitized_proxy}.json"
new_path = os.path.join(job_dir_path, new_filename)
try:
os.rename(info_json_path, new_path)
final_info_json_path = new_path
logger.info(f"Renamed info.json to include proxy: {new_path}")
except OSError as e:
logger.error(f"Failed to rename info.json to include proxy: {e}. Using original path.")
return { return {
'info_json_path': info_json_path, 'info_json_path': final_info_json_path,
'job_dir_path': job_dir_path, 'job_dir_path': job_dir_path,
'socks_proxy': proxy, 'socks_proxy': proxy,
'ytdlp_command': None, 'ytdlp_command': None,
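The renamed info.json now carries the proxy in its file name. The consumer of that name is not shown in this diff; assuming it simply reverses the '---' substitution applied above, recovery could look like:

```python
import os
import re
from typing import Optional

def proxy_from_info_json_path(path: str) -> Optional[str]:
    """Recover the proxy embedded by the rename above, e.g.
    'info_abc_acct_20250101_000000_proxy_socks5---10.0.0.1:1080.json' -> 'socks5://10.0.0.1:1080'."""
    match = re.search(r"_proxy_(.+)\.json$", os.path.basename(path))
    if not match:
        return None
    return match.group(1).replace("---", "://")
```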
@ -498,10 +565,15 @@ def handle_bannable_error_branch(task_id_to_check: str, **context):
error_code = error_details.get('error_code', '').strip() error_code = error_details.get('error_code', '').strip()
policy = params.get('on_bannable_failure', 'retry_with_new_account') policy = params.get('on_bannable_failure', 'retry_with_new_account')
# Check if this is an age confirmation error - should not stop the loop # Unrecoverable video errors that should not be retried or treated as system failures.
if "Sign in to confirm your age" in error_message or "confirm your age" in error_message.lower(): unrecoverable_video_errors = [
logger.info(f"Age confirmation error detected for '{task_id_to_check}'. This is a content restriction, not a bot detection issue.") "AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "COPYRIGHT_REMOVAL",
return 'handle_age_restriction_error' "GEO_RESTRICTED", "VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED"
]
if error_code in unrecoverable_video_errors:
logger.warning(f"Unrecoverable video error '{error_code}' detected for '{task_id_to_check}'. This is a content issue, not a system failure.")
return 'handle_unrecoverable_video_error'
# Fatal Thrift connection errors that should stop all processing. # Fatal Thrift connection errors that should stop all processing.
if error_code == 'TRANSPORT_ERROR': if error_code == 'TRANSPORT_ERROR':
@ -718,6 +790,59 @@ def push_auth_success_to_redis(initial_data: dict, token_data: dict, **context):
logger.info(f"Pushed successful auth data for URL '{url}' to '{dl_inbox_queue}'.") logger.info(f"Pushed successful auth data for URL '{url}' to '{dl_inbox_queue}'.")
logger.info(f"Stored success result for auth on URL '{url}' in '{auth_result_queue}'.") logger.info(f"Stored success result for auth on URL '{url}' in '{auth_result_queue}'.")
@task
def handle_unrecoverable_video_error(**context):
"""
Handles errors for videos that are unavailable (private, removed, etc.).
These are not system failures, so the URL is logged to a 'skipped' queue
and the processing loop continues without marking the run as failed.
"""
params = context['params']
ti = context['task_instance']
url = params.get('url_to_process', 'unknown')
# Collect error details from the failed get_token task
error_details = {}
first_token_error = ti.xcom_pull(task_ids='initial_attempt.get_token', key='error_details')
retry_token_error = ti.xcom_pull(task_ids='retry_logic.retry_get_token', key='error_details')
if retry_token_error:
error_details = retry_token_error
elif first_token_error:
error_details = first_token_error
error_code = error_details.get('error_code', 'UNKNOWN_VIDEO_ERROR')
error_message = error_details.get('error_message', 'Video is unavailable for an unknown reason.')
logger.warning(f"Skipping URL '{url}' due to unrecoverable video error: {error_code} - {error_message}")
result_data = {
'status': 'skipped',
'end_time': time.time(),
'url': url,
'dag_run_id': context['dag_run'].run_id,
'reason': error_code,
'details': error_message,
'error_details': error_details
}
try:
client = _get_redis_client(params['redis_conn_id'])
# New queue for skipped videos
skipped_queue = f"{params['queue_name']}_skipped"
progress_queue = f"{params['queue_name']}_progress"
with client.pipeline() as pipe:
pipe.hset(skipped_queue, url, json.dumps(result_data))
pipe.hdel(progress_queue, url)
pipe.execute()
logger.info(f"Stored skipped result for URL '{url}' in '{skipped_queue}' and removed from progress queue.")
except Exception as e:
logger.error(f"Could not report skipped video to Redis: {e}", exc_info=True)
@task(trigger_rule='one_failed') @task(trigger_rule='one_failed')
def report_failure_and_continue(**context): def report_failure_and_continue(**context):
""" """
@ -732,8 +857,8 @@ def report_failure_and_continue(**context):
error_details = {} error_details = {}
# Check for error details from get_token tasks # Check for error details from get_token tasks
first_token_task_id = 'get_token' first_token_task_id = 'initial_attempt.get_token'
retry_token_task_id = 'retry_get_token' retry_token_task_id = 'retry_logic.retry_get_token'
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details') first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details') retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
@ -798,8 +923,8 @@ def handle_fatal_error(**context):
# Collect error details # Collect error details
error_details = {} error_details = {}
first_token_task_id = 'get_token' first_token_task_id = 'initial_attempt.get_token'
retry_token_task_id = 'retry_get_token' retry_token_task_id = 'retry_logic.retry_get_token'
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details') first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details') retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
@ -866,6 +991,12 @@ def continue_processing_loop(**context):
logger.info(f"DAG run '{dag_run.run_id}' does not appear to be triggered by the dispatcher. Stopping processing loop.") logger.info(f"DAG run '{dag_run.run_id}' does not appear to be triggered by the dispatcher. Stopping processing loop.")
return return
dispatcher_dag_id = 'ytdlp_ops_v02_dispatcher_auth'
dag_model = DagModel.get_dagmodel(dispatcher_dag_id)
if dag_model and dag_model.is_paused:
logger.warning(f"Dispatcher DAG '{dispatcher_dag_id}' is paused. Stopping processing loop.")
return
# Create a new unique run_id for the dispatcher. # Create a new unique run_id for the dispatcher.
# Using a timestamp and UUID ensures the ID is unique and does not grow in length over time, # Using a timestamp and UUID ensures the ID is unique and does not grow in length over time,
# preventing database errors. # preventing database errors.
@ -880,7 +1011,7 @@ def continue_processing_loop(**context):
logger.info(f"Worker finished successfully. Triggering a new dispatcher ('{new_dispatcher_run_id}') to continue the loop.") logger.info(f"Worker finished successfully. Triggering a new dispatcher ('{new_dispatcher_run_id}') to continue the loop.")
trigger_dag( trigger_dag(
dag_id='ytdlp_ops_v02_dispatcher_auth', dag_id=dispatcher_dag_id,
run_id=new_dispatcher_run_id, run_id=new_dispatcher_run_id,
conf=conf_to_pass, conf=conf_to_pass,
replace_microseconds=False replace_microseconds=False
@ -902,10 +1033,15 @@ def handle_retry_failure_branch(task_id_to_check: str, **context):
error_message = error_details.get('error_message', '').strip() error_message = error_details.get('error_message', '').strip()
error_code = error_details.get('error_code', '').strip() error_code = error_details.get('error_code', '').strip()
# Check if this is an age confirmation error - should not stop the loop # Unrecoverable video errors that should not be retried or treated as system failures.
if "Sign in to confirm your age" in error_message or "confirm your age" in error_message.lower(): unrecoverable_video_errors = [
logger.info(f"Age confirmation error detected on retry from '{task_id_to_check}'. This is a content restriction, not a bot detection issue.") "AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "COPYRIGHT_REMOVAL",
return 'handle_age_restriction_error' "GEO_RESTRICTED", "VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED"
]
if error_code in unrecoverable_video_errors:
logger.warning(f"Unrecoverable video error '{error_code}' detected on retry for '{task_id_to_check}'.")
return 'handle_unrecoverable_video_error'
if error_code == 'TRANSPORT_ERROR': if error_code == 'TRANSPORT_ERROR':
logger.error(f"Fatal Thrift connection error on retry from '{task_id_to_check}'.") logger.error(f"Fatal Thrift connection error on retry from '{task_id_to_check}'.")
@ -964,8 +1100,8 @@ def report_bannable_and_continue(**context):
# Collect error details # Collect error details
error_details = {} error_details = {}
first_token_task_id = 'get_token' first_token_task_id = 'initial_attempt.get_token'
retry_token_task_id = 'retry_get_token' retry_token_task_id = 'retry_logic.retry_get_token'
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details') first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details') retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
@ -1014,71 +1150,6 @@ def report_bannable_and_continue(**context):
logger.error(f"Could not report bannable error to Redis: {e}", exc_info=True) logger.error(f"Could not report bannable error to Redis: {e}", exc_info=True)
@task
def handle_age_restriction_error(**context):
"""
Handles age restriction errors specifically. These are content restrictions
that cannot be bypassed by using different accounts, so we report the failure
and continue the processing loop rather than stopping it.
"""
params = context['params']
ti = context['task_instance']
url = params.get('url_to_process', 'unknown')
# Collect error details
error_details = {}
first_token_task_id = 'get_token'
retry_token_task_id = 'retry_get_token'
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
# Use the most recent error details
if retry_token_error:
error_details = retry_token_error
elif first_token_error:
error_details = first_token_error
logger.error(f"Age restriction error for URL '{url}'. This content requires age confirmation and cannot be bypassed.")
# Report failure to Redis so the URL can be marked as failed
try:
client = _get_redis_client(params['redis_conn_id'])
# Update client-specific stats
try:
machine_id = params.get('machine_id') or socket.gethostname()
_update_client_stats(client, params.get('clients', ''), 'failure', url, machine_id, context['dag_run'].run_id)
except Exception as e:
logger.error(f"Could not update client stats on age restriction error: {e}", exc_info=True)
result_data = {
'status': 'failed',
'end_time': time.time(),
'url': url,
'dag_run_id': context['dag_run'].run_id,
'error': 'age_restriction',
'error_message': 'Content requires age confirmation',
'error_details': error_details
}
result_queue = f"{params['queue_name']}_result"
fail_queue = f"{params['queue_name']}_fail"
progress_queue = f"{params['queue_name']}_progress"
with client.pipeline() as pipe:
pipe.hset(result_queue, url, json.dumps(result_data))
pipe.hset(fail_queue, url, json.dumps(result_data))
pipe.hdel(progress_queue, url)
pipe.execute()
logger.info(f"Stored age restriction error for URL '{url}' in '{result_queue}' and '{fail_queue}'.")
except Exception as e:
logger.error(f"Could not report age restriction error to Redis: {e}", exc_info=True)
# This is NOT a fatal error for the processing loop - we just continue with the next URL
# ============================================================================= # =============================================================================
# DAG Definition with TaskGroups # DAG Definition with TaskGroups
# ============================================================================= # =============================================================================
@ -1106,9 +1177,10 @@ with DAG(
'timeout': Param(DEFAULT_TIMEOUT, type="integer"), 'timeout': Param(DEFAULT_TIMEOUT, type="integer"),
'on_bannable_failure': Param('stop_loop_on_auth_proceed_on_download_error', type="string", enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'retry_and_ban_account_only', 'retry_on_connection_error', 'proceed_loop_under_manual_inspection', 'stop_loop_on_auth_proceed_on_download_error']), 'on_bannable_failure': Param('stop_loop_on_auth_proceed_on_download_error', type="string", enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'retry_and_ban_account_only', 'retry_on_connection_error', 'proceed_loop_under_manual_inspection', 'stop_loop_on_auth_proceed_on_download_error']),
'request_params_json': Param(json.dumps(DEFAULT_REQUEST_PARAMS), type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."), 'request_params_json': Param(json.dumps(DEFAULT_REQUEST_PARAMS), type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."),
'language_code': Param('en-US', type="string", title="[Worker Param] Language Code", description="The language code (e.g., 'en-US', 'de-DE') to use for the YouTube request headers."),
'auto_create_new_accounts_on_exhaustion': Param(True, type="boolean"), 'auto_create_new_accounts_on_exhaustion': Param(True, type="boolean"),
# --- Manual Run / Internal Parameters --- # --- Manual Run / Internal Parameters ---
'manual_url_to_process': Param('iPwdia3gAnk', type=["string", "null"], title="[Manual Run] URL to Process", description="For manual runs, provide a single YouTube URL to process. This is ignored if triggered by the dispatcher."), 'manual_url_to_process': Param('iPwdia3gAnk', type=["string", "null"], title="[Manual Run] URL to Process", description="For manual runs, provide a single YouTube URL, or the special value 'PULL_FROM_QUEUE' to pull one URL from the Redis inbox. This is ignored if triggered by the dispatcher."),
'url_to_process': Param(None, type=["string", "null"], title="[Internal] URL from Dispatcher", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."), 'url_to_process': Param(None, type=["string", "null"], title="[Internal] URL from Dispatcher", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
'worker_queue': Param(None, type=["string", "null"], title="[Internal] Worker Queue", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."), 'worker_queue': Param(None, type=["string", "null"], title="[Internal] Worker Queue", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
} }
@ -1121,7 +1193,7 @@ with DAG(
fatal_error_task = handle_fatal_error() fatal_error_task = handle_fatal_error()
report_failure_task = report_failure_and_continue() report_failure_task = report_failure_and_continue()
continue_loop_task = continue_processing_loop() continue_loop_task = continue_processing_loop()
age_restriction_task = handle_age_restriction_error() unrecoverable_video_error_task = handle_unrecoverable_video_error()
report_bannable_and_continue_task = report_bannable_and_continue() report_bannable_and_continue_task = report_bannable_and_continue()
# --- Task Group 1: Initial Attempt --- # --- Task Group 1: Initial Attempt ---
@ -1138,7 +1210,7 @@ with DAG(
) )
first_token_attempt >> initial_branch_task first_token_attempt >> initial_branch_task
initial_branch_task >> [fatal_error_task, ban_and_report_immediately_task, age_restriction_task, report_bannable_and_continue_task] initial_branch_task >> [fatal_error_task, ban_and_report_immediately_task, unrecoverable_video_error_task, report_bannable_and_continue_task]
# --- Task Group 2: Retry Logic --- # --- Task Group 2: Retry Logic ---
with TaskGroup("retry_logic", tooltip="Retry logic with account management") as retry_logic_group: with TaskGroup("retry_logic", tooltip="Retry logic with account management") as retry_logic_group:
@ -1188,7 +1260,7 @@ with DAG(
direct_retry_account_task >> coalesced_retry_data direct_retry_account_task >> coalesced_retry_data
coalesced_retry_data >> retry_token_task coalesced_retry_data >> retry_token_task
retry_token_task >> retry_branch_task retry_token_task >> retry_branch_task
retry_branch_task >> [fatal_error_task, report_failure_task, ban_after_retry_report_task, age_restriction_task, report_bannable_and_continue_task] retry_branch_task >> [fatal_error_task, report_failure_task, ban_after_retry_report_task, unrecoverable_video_error_task, report_bannable_and_continue_task]
ban_after_retry_report_task >> report_failure_task ban_after_retry_report_task >> report_failure_task
# --- Task Group 3: Success/Continuation Logic --- # --- Task Group 3: Success/Continuation Logic ---
@ -1210,7 +1282,7 @@ with DAG(
# --- DAG Dependencies between TaskGroups --- # --- DAG Dependencies between TaskGroups ---
# Initial attempt can lead to retry logic or direct failure # Initial attempt can lead to retry logic or direct failure
initial_branch_task >> [retry_logic_group, fatal_error_task, ban_and_report_immediately_task, age_restriction_task, report_bannable_and_continue_task] initial_branch_task >> [retry_logic_group, fatal_error_task, ban_and_report_immediately_task, unrecoverable_video_error_task, report_bannable_and_continue_task]
# A successful initial attempt bypasses retry and goes straight to the success group # A successful initial attempt bypasses retry and goes straight to the success group
initial_attempt_group >> success_group initial_attempt_group >> success_group
@ -1222,6 +1294,6 @@ with DAG(
# Ban and report immediately leads to failure reporting # Ban and report immediately leads to failure reporting
ban_and_report_immediately_task >> report_failure_task ban_and_report_immediately_task >> report_failure_task
# Age restriction error leads to failure reporting and continues the loop # Unrecoverable/bannable errors that don't stop the loop should continue processing
age_restriction_task >> continue_loop_task unrecoverable_video_error_task >> continue_loop_task
report_bannable_and_continue_task >> continue_loop_task report_bannable_and_continue_task >> continue_loop_task

File diff suppressed because it is too large.

View File

@ -41,17 +41,24 @@ def run_s3_upload_batch(**context):
Dry run mode is non-destructive and will pause briefly after checking to prevent tight loops. Dry run mode is non-destructive and will pause briefly after checking to prevent tight loops.
""" """
params = context['params'] params = context['params']
ti = context['task_instance']
# Log the configured execution timeout for debugging purposes.
# This helps verify that the timeout setting from the DAG file is being applied.
timeout_delta = ti.task.execution_timeout
logger.info(f"Task is configured with execution_timeout: {timeout_delta}")
concurrency = params['concurrency'] concurrency = params['concurrency']
mode = params['mode'] mode = params['mode']
dry_run = params['dry_run'] dry_run = params['dry_run']
sleep_interval_min = params['sleep_if_no_videos_min'] sleep_interval_min = params['sleep_if_no_videos_min']
sleep_interval_sec = sleep_interval_min * 60 sleep_interval_sec = sleep_interval_min * 60
s3_conn_id = params['s3_conn_id'] s3_conn_id = params['s3_conn_id']
s3_bucket = params['s3_bucket_name']
s3_access_key_id = None s3_access_key_id = None
s3_secret_access_key = None s3_secret_access_key = None
s3_endpoint = None s3_endpoint = None
s3_bucket = None
s3_region = None s3_region = None
config_source = "Unknown" config_source = "Unknown"
profile_name = "rusonyx" profile_name = "rusonyx"
@ -68,12 +75,11 @@ def run_s3_upload_batch(**context):
s3_endpoint = s3_conn.host s3_endpoint = s3_conn.host
extra_config = s3_conn.extra_dejson extra_config = s3_conn.extra_dejson
s3_bucket = extra_config.get('bucket')
s3_region = extra_config.get('region_name') s3_region = extra_config.get('region_name')
if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_bucket, s3_region]): if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_region]):
logger.warning("S3 connection from Airflow is missing one or more required fields. Will attempt to fall back to environment variables.") logger.warning("S3 connection from Airflow is missing one or more required fields (excluding bucket). Will attempt to fall back to environment variables.")
s3_access_key_id = s3_secret_access_key = s3_endpoint = s3_bucket = s3_region = None # Reset all s3_access_key_id = s3_secret_access_key = s3_endpoint = s3_region = None # Reset all
else: else:
config_source = f"Airflow Connection '{s3_conn_id}'" config_source = f"Airflow Connection '{s3_conn_id}'"
profile_name = "rusonyx-airflow" profile_name = "rusonyx-airflow"
@ -82,17 +88,16 @@ def run_s3_upload_batch(**context):
logger.warning(f"Failed to load S3 configuration from Airflow connection '{s3_conn_id}': {e}. Will attempt to fall back to environment variables.") logger.warning(f"Failed to load S3 configuration from Airflow connection '{s3_conn_id}': {e}. Will attempt to fall back to environment variables.")
# --- Attempt 2: Fallback to Environment Variables --- # --- Attempt 2: Fallback to Environment Variables ---
if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_bucket, s3_region]): if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_region]):
try: try:
logger.info("Attempting to load S3 configuration from environment variables as a fallback.") logger.info("Attempting to load S3 configuration from environment variables as a fallback.")
s3_access_key_id = os.environ['S3_DELIVERY_AWS_ACCESS_KEY_ID'] s3_access_key_id = os.environ['S3_DELIVERY_AWS_ACCESS_KEY_ID']
s3_secret_access_key = os.environ['S3_DELIVERY_AWS_SECRET_ACCESS_KEY'] s3_secret_access_key = os.environ['S3_DELIVERY_AWS_SECRET_ACCESS_KEY']
s3_endpoint = os.environ['S3_DELIVERY_ENDPOINT'] s3_endpoint = os.environ['S3_DELIVERY_ENDPOINT']
s3_bucket = os.environ['S3_DELIVERY_BUCKET']
s3_region = os.environ['S3_DELIVERY_AWS_REGION'] s3_region = os.environ['S3_DELIVERY_AWS_REGION']
if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_bucket, s3_region]): if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_region]):
raise ValueError("One or more S3 configuration environment variables are empty.") raise ValueError("One or more S3 configuration environment variables are empty (excluding bucket).")
config_source = "Environment Variables" config_source = "Environment Variables"
profile_name = "rusonyx" profile_name = "rusonyx"
@ -100,6 +105,9 @@ def run_s3_upload_batch(**context):
logger.error(f"Having problems reading S3 configuration from environment variables: {e}", exc_info=True) logger.error(f"Having problems reading S3 configuration from environment variables: {e}", exc_info=True)
raise AirflowException("S3 configuration is missing. Could not load from Airflow connection or environment variables.") raise AirflowException("S3 configuration is missing. Could not load from Airflow connection or environment variables.")
if not s3_bucket:
raise AirflowException("S3 bucket name is not specified in DAG parameters.")
s3_destination = f"s3://{s3_bucket}/" s3_destination = f"s3://{s3_bucket}/"
logger.info(f"Starting S3 upload loop. Watching source '{READY_PATH}' for delivery to '{s3_destination}'.") logger.info(f"Starting S3 upload loop. Watching source '{READY_PATH}' for delivery to '{s3_destination}'.")
@ -328,6 +336,21 @@ with DAG(
2. Ansible updates an Airflow Variable named `s3_worker_hostnames` with a JSON list of all active uploader workers (typically dlXXX machines). Each worker listens to its own queue (e.g., `queue-dl-dl001`). 2. Ansible updates an Airflow Variable named `s3_worker_hostnames` with a JSON list of all active uploader workers (typically dlXXX machines). Each worker listens to its own queue (e.g., `queue-dl-dl001`).
3. This DAG reads the variable on manual trigger or after a pause/resume cycle to create the dynamic tasks. This allows for easy inspection of per-worker logs and status from the Airflow UI. 3. This DAG reads the variable on manual trigger or after a pause/resume cycle to create the dynamic tasks. This allows for easy inspection of per-worker logs and status from the Airflow UI.
4. Each dynamic task watches a shared folder (`/opt/airflow/downloadfiles/videos/ready`). Download workers place completed videos into timestamped sub-folders (e.g., `20241122T1050`). The uploader processes these 10-minute batches, copying them to S3 with `s5cmd` and then deleting the source directories. This design avoids race conditions and improves performance. 4. Each dynamic task watches a shared folder (`/opt/airflow/downloadfiles/videos/ready`). Download workers place completed videos into timestamped sub-folders (e.g., `20241122T1050`). The uploader processes these 10-minute batches, copying them to S3 with `s5cmd` and then deleting the source directories. This design avoids race conditions and improves performance.
#### Why use 10-minute batch folders?
While an `mv` command (atomic on the same filesystem) is sufficient to ensure a single video directory is complete when it appears in the `ready` folder, the batching system solves higher-level concurrency and efficiency problems in a high-throughput environment.
- **Concurrency Management**: The uploader needs to process a discrete *set* of videos. By working on batches from a *previous* time window (e.g., uploading the `10:40` batch after `10:50`), it guarantees that no new files will be added to that batch while it's being processed. This creates a clean, reliable unit of work and prevents the uploader from missing videos that are moved in while it's compiling its list.
- **Bulk Operation Efficiency**: It is far more efficient to upload hundreds of videos in a single bulk command than one by one. The batching system allows videos to accumulate, and the uploader sends them all to S3 in one highly optimized `s5cmd run` command. Similarly, after a successful upload, the uploader can delete the single parent batch directory, which is much faster than deleting hundreds of individual video folders.
- **Continuous Operation**: The uploader task is a long-running loop. If processing a batch takes longer than 10 minutes (e.g., due to a large volume of videos or slow network), the uploader will continue working on that batch until it is complete. It only sleeps when it has processed all available completed batches and is waiting for new ones to become ready.
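A minimal sketch of the naming and eligibility rules described above, assuming batch folders use a `%Y%m%dT%H%M` name truncated to the 10-minute window (e.g. `20241122T1050`):

```python
from datetime import datetime

def batch_folder_name(ts: datetime) -> str:
    # Truncate to the containing 10-minute window: 2024-11-22 10:57 -> "20241122T1050".
    floored = ts.replace(minute=ts.minute - ts.minute % 10, second=0, microsecond=0)
    return floored.strftime("%Y%m%dT%H%M")

def is_closed_batch(folder_name: str, now: datetime) -> bool:
    # Any folder older than the current window can no longer receive new videos,
    # so it is a safe, complete unit of work for the uploader.
    return folder_name < batch_folder_name(now)
```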
#### Cleanup Method: `rsync` vs `shutil.rmtree`
The cleanup process uses the `rsync` empty-folder trick to delete the contents of the batch directory before removing the directory itself. This is a deliberate performance optimization. The command is effectively: `rsync -a --delete /path/to/empty/ /path/to/delete/`.
- Python's `shutil.rmtree` can be slow as it makes an individual `os.remove()` system call for every file.
- The `rsync` method is a well-known and highly efficient alternative for this scenario, as `rsync` is a mature C program optimized for these operations. More details on this performance difference can be found here: https://stackoverflow.com/questions/5470939/why-is-shutil-rmtree-so-slow
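A minimal sketch of this trick (the DAG's actual cleanup code is not part of this hunk):

```python
import os
import subprocess
import tempfile

def fast_rmtree(target_dir: str) -> None:
    # rsync an empty directory over the target so its contents are deleted in bulk,
    # then remove the now-empty directory itself.
    with tempfile.TemporaryDirectory() as empty_dir:
        subprocess.run(
            ["rsync", "-a", "--delete", f"{empty_dir}/", f"{target_dir}/"],
            check=True,
        )
    os.rmdir(target_dir)
```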
""", """,
params={ params={
'mode': Param( 'mode': Param(
@ -339,9 +362,15 @@ with DAG(
description="If True, the DAG will perform all steps except the actual upload and cleanup. `s5cmd` will be run with `--dry-run`, and the final directory removal will be skipped. Log messages will indicate what would have happened." description="If True, the DAG will perform all steps except the actual upload and cleanup. `s5cmd` will be run with `--dry-run`, and the final directory removal will be skipped. Log messages will indicate what would have happened."
), ),
'concurrency': Param(10, type="integer", title="s5cmd Concurrency"), 'concurrency': Param(10, type="integer", title="s5cmd Concurrency"),
'sleep_if_no_videos_min': Param(10, type="integer", title="Sleep if Idle (minutes)", description="How many minutes the task should sleep if no videos are found to upload."), 'sleep_if_no_videos_min': Param(5, type="integer", title="Sleep if Idle (minutes)", description="How many minutes the task should sleep if no videos are found to upload. This should be less than any external timeout (e.g., Celery's worker_proc_timeout)."),
'batch_completion_wait_min': Param(0, type="integer", title="Batch Completion Wait (minutes)", description="How many minutes to wait after a 10-minute batch window closes before considering it for upload. Default is 0, which processes the current batch immediately. A value of 10 restores the old behavior of waiting for the next 10-minute window."), 'batch_completion_wait_min': Param(0, type="integer", title="Batch Completion Wait (minutes)", description="How many minutes to wait after a 10-minute batch window closes before considering it for upload. Default is 0, which processes the current batch immediately. A value of 10 restores the old behavior of waiting for the next 10-minute window."),
's3_conn_id': Param('s3_delivery_connection', type="string", title="S3 Connection ID", description="The Airflow connection ID for the S3-compatible storage. If this connection is invalid or missing, the task will fall back to environment variables."), 's3_conn_id': Param('s3_delivery_connection', type="string", title="S3 Connection ID", description="The Airflow connection ID for the S3-compatible storage. If this connection is invalid or missing, the task will fall back to environment variables."),
's3_bucket_name': Param(
'videos',
type="string",
title="S3 Bucket Name",
description="The name of the S3 bucket to upload to. Common values are 'videos' or 'videos-prod'."
),
} }
) as dag: ) as dag:
@ -410,7 +439,8 @@ with DAG(
# Create a task for each worker, pinned to its specific queue # Create a task for each worker, pinned to its specific queue
upload_task = task( upload_task = task(
task_id=f'upload_batch_on_{task_id_hostname}', task_id=f'upload_batch_on_{task_id_hostname}',
queue=f'queue-s3-{hostname}' queue=f'queue-s3-{hostname}',
execution_timeout=timedelta(days=1),
)(run_s3_upload_batch)() )(run_s3_upload_batch)()
worker_tasks.append(upload_task) worker_tasks.append(upload_task)

View File

@ -138,6 +138,7 @@ def generate_configs():
logging.info(f"Service role for generation: '{service_role}'") logging.info(f"Service role for generation: '{service_role}'")
# --- Camoufox Configuration (only for worker/all-in-one roles) --- # --- Camoufox Configuration (only for worker/all-in-one roles) ---
logging.info("--- Camoufox (Remote Browser) Configuration ---")
camoufox_proxies = [] camoufox_proxies = []
expanded_camoufox_proxies_str = "" expanded_camoufox_proxies_str = ""
if service_role != 'management': if service_role != 'management':
@ -210,7 +211,7 @@ def generate_configs():
logging.info("This file maps each proxy to a list of WebSocket endpoints for Camoufox.") logging.info("This file maps each proxy to a list of WebSocket endpoints for Camoufox.")
logging.info("The token_generator uses this map to connect to the correct remote browser.") logging.info("The token_generator uses this map to connect to the correct remote browser.")
else: else:
logging.info("Skipping Camoufox configuration generation for 'management' role.") logging.info("Skipping Camoufox configuration generation.")
# --- Generate docker-compose-ytdlp-ops.yaml --- # --- Generate docker-compose-ytdlp-ops.yaml ---
ytdlp_ops_template = env.get_template('docker-compose-ytdlp-ops.yaml.j2') ytdlp_ops_template = env.get_template('docker-compose-ytdlp-ops.yaml.j2')

View File

@ -1,64 +1,46 @@
# Ansible for YT-DLP Cluster # Ansible Deployment for YT-DLP Cluster
This directory contains the Ansible playbooks, roles, and configurations for deploying and managing the YT-DLP Airflow cluster. This document provides an overview of the Ansible playbooks used to deploy and manage the YT-DLP Airflow cluster.
**Note:** All commands should be run from the project root, not from within this directory. ## Main Playbooks
Example: `ansible-playbook ansible/playbook-full.yml`
## Full Deployment These are the primary entry points for cluster management.
### Deploy entire cluster with proxies (recommended for new setups): - `playbook-full-with-proxies.yml`: **(Recommended Entry Point)** Deploys shadowsocks proxies and then the entire application stack.
- `playbook-full.yml`: Deploys the entire application stack (master and workers) without touching proxies.
- `playbook-master.yml`: Deploys/updates only the Airflow master node.
- `playbook-worker.yml`: Deploys/updates all Airflow worker nodes.
- `playbook-proxies.yml`: Deploys/updates only the shadowsocks proxy services on all nodes.
```bash ## Component & Utility Playbooks
ansible-playbook ansible/playbook-full-with-proxies.yml
```
### Deploy cluster without proxies: These playbooks are used for more specific tasks or are called by the main playbooks.
```bash ### Core Deployment Logic
ansible-playbook ansible/playbook-full.yml - `roles/airflow-master/tasks/main.yml`: Contains all tasks for setting up the Airflow master services.
``` - `roles/airflow-worker/tasks/main.yml`: Contains all tasks for setting up the Airflow worker services.
- `roles/ytdlp-master/tasks/main.yml`: Contains tasks for setting up the YT-DLP management services on the master.
- `roles/ytdlp-worker/tasks/main.yml`: Contains tasks for setting up YT-DLP, Camoufox, and other worker-specific services.
## Targeted Deployments ### Utility & Maintenance
- `playbook-dags.yml`: Quickly syncs only the `dags/` and `config/` directories to all nodes.
- `playbook-hook.yml`: Syncs Airflow custom hooks and restarts relevant services.
- `playbook-sync-local.yml`: Syncs local development files (e.g., `ytops_client`, `pangramia`) to workers.
- `playbooks/pause_worker.yml`: Pauses a worker by creating a lock file, preventing it from taking new tasks.
- `playbooks/resume_worker.yml`: Resumes a paused worker by removing the lock file.
- `playbooks/playbook-bgutils-start.yml`: Starts the `bgutil-provider` container.
- `playbooks/playbook-bgutils-stop.yml`: Stops the `bgutil-provider` container.
- `playbook-update-s3-vars.yml`: Updates the `s3_delivery_connection` in Airflow.
- `playbook-update-regression-script.yml`: Updates the `regression.py` script on the master.
### Deploy only to master node: ### Deprecated
- `playbook-dl.yml`: Older worker deployment logic. Superseded by `playbook-worker.yml`.
- `playbook-depricated.dl.yml`: Older worker deployment logic. Superseded by `playbook-worker.yml`.
```bash ## Current Goal: Disable Camoufox & Enable Aria2
ansible-playbook ansible/playbook-master.yml --limit="af-test"
```
### Deploy only to worker nodes: The current objective is to modify the worker deployment (`playbook-worker.yml` and its role `roles/ytdlp-worker/tasks/main.yml`) to:
1. **Disable Camoufox**: Prevent the build, configuration generation, and startup of all `camoufox` services.
2. **Enable Aria2**: Ensure the `aria2-pro` service is built and started correctly on worker nodes.
```bash The `playbook-worker.yml` has already been updated to build the `aria2-pro` image. The next steps will involve modifying `roles/ytdlp-worker/tasks/main.yml` to remove the Camoufox-related tasks.
ansible-playbook ansible/playbook-worker.yml
```
## DAGs Only Deployment
To update only DAG files and configurations:
```bash
ansible-playbook ansible/playbook-dags.yml
```
## Managing Worker State (Pause/Resume)
The system allows for gracefully pausing a worker to prevent it from picking up new tasks. This is useful for maintenance or decommissioning a node. The mechanism uses a lock file (`AIRFLOW.PREVENT_URL_PULL.lock`) on the worker host.
### To Pause a Worker
This command creates the lock file, causing the `ytdlp_ops_dispatcher` DAG to skip task execution on this host.
```bash
# Replace "worker-hostname" with the target host from your inventory
ansible-playbook ansible/playbooks/pause_worker.yml --limit "worker-hostname"
```
### To Resume a Worker
This command removes the lock file, allowing the worker to resume picking up tasks.
```bash
# Replace "worker-hostname" with the target host from your inventory
ansible-playbook ansible/playbooks/resume_worker.yml --limit "worker-hostname"
```

View File

@ -13,3 +13,4 @@ vault_s3_delivery_secret_access_key: "33b155c5d2ea4fccb0faeeefb420d7ac"
vault_s3_delivery_endpoint: "https://s3.rusonyxcloud.ru" vault_s3_delivery_endpoint: "https://s3.rusonyxcloud.ru"
vault_s3_delivery_bucket: "videos" vault_s3_delivery_bucket: "videos"
vault_s3_delivery_aws_region: "ru-msk" vault_s3_delivery_aws_region: "ru-msk"
vault_aria2_rpc_secret: "aR1a2_sEcReT_pWd_f0r_yTd1p"

View File

@ -11,7 +11,7 @@
src: "../airflow/dags/" src: "../airflow/dags/"
dest: /srv/airflow_master/dags/ dest: /srv/airflow_master/dags/
archive: yes archive: yes
delete: yes delete: no
rsync_path: "sudo rsync" rsync_path: "sudo rsync"
rsync_opts: rsync_opts:
- "--exclude=__pycache__/" - "--exclude=__pycache__/"
@ -42,7 +42,7 @@
src: "../airflow/dags/" src: "../airflow/dags/"
dest: /srv/airflow_dl_worker/dags/ dest: /srv/airflow_dl_worker/dags/
archive: yes archive: yes
delete: yes delete: no
rsync_path: "sudo rsync" rsync_path: "sudo rsync"
rsync_opts: rsync_opts:
- "--exclude=__pycache__/" - "--exclude=__pycache__/"

View File

@ -91,7 +91,6 @@
files: files:
- configs/docker-compose-dl.yaml - configs/docker-compose-dl.yaml
- configs/docker-compose-ytdlp-ops.yaml - configs/docker-compose-ytdlp-ops.yaml
- configs/docker-compose.camoufox.yaml
state: present state: present
remove_orphans: true remove_orphans: true
pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}" pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"

View File

@ -216,6 +216,17 @@
become: yes become: yes
become_user: "{{ ansible_user }}" become_user: "{{ ansible_user }}"
- name: Sync aria2-pro-docker to worker for build context
ansible.posix.synchronize:
src: "../airflow/aria2-pro-docker/"
dest: "{{ airflow_worker_dir }}/aria2-pro-docker/"
rsync_opts:
- "--delete"
recursive: yes
perms: yes
become: yes
become_user: "{{ ansible_user }}"
- name: Ensure bin directory exists on worker for build context - name: Ensure bin directory exists on worker for build context
ansible.builtin.file: ansible.builtin.file:
path: "{{ airflow_worker_dir }}/bin" path: "{{ airflow_worker_dir }}/bin"
@ -275,15 +286,6 @@
- name: Include Docker health check tasks - name: Include Docker health check tasks
include_tasks: tasks/docker_health_check.yml include_tasks: tasks/docker_health_check.yml
- name: Build local Docker images (e.g., camoufox)
ansible.builtin.command: >
docker compose --project-directory . -f configs/docker-compose-ytdlp-ops.yaml build
args:
chdir: "{{ airflow_worker_dir }}"
become: yes
become_user: "{{ ansible_user }}"
register: docker_build_result
changed_when: "'Building' in docker_build_result.stdout or 'writing image' in docker_build_result.stdout"
- name: Pull pre-built Docker images for ytdlp-ops services - name: Pull pre-built Docker images for ytdlp-ops services
ansible.builtin.command: > ansible.builtin.command: >

View File

@ -47,7 +47,6 @@
- "docker-compose-ytdlp-ops.yaml.j2" - "docker-compose-ytdlp-ops.yaml.j2"
- "docker-compose.config-generate.yaml" - "docker-compose.config-generate.yaml"
- "envoy.yaml.j2" - "envoy.yaml.j2"
- "docker-compose.camoufox.yaml.j2"
- name: Create .env file for YT-DLP master service - name: Create .env file for YT-DLP master service
template: template:
@ -117,19 +116,6 @@
recurse: yes recurse: yes
become: yes become: yes
- name: Create dummy camoufox compose file for master to prevent errors
copy:
content: |
# This is a placeholder file.
# The master node does not run Camoufox, but the shared docker-compose-ytdlp-ops.yaml
# may unconditionally include this file, causing an error if it's missing.
# This file provides an empty services block to satisfy the include.
services: {}
dest: "{{ airflow_master_dir }}/configs/docker-compose.camoufox.yaml"
mode: "{{ file_permissions }}"
owner: "{{ ssh_user }}"
group: "{{ deploy_group }}"
become: yes
- name: Check for shadowsocks-rust proxy compose file - name: Check for shadowsocks-rust proxy compose file
stat: stat:

View File

@ -66,18 +66,7 @@
- name: "Log: Syncing YT-DLP service files" - name: "Log: Syncing YT-DLP service files"
debug: debug:
msg: "Syncing YT-DLP service components (config generator, envoy/camoufox templates) to the worker node." msg: "Syncing YT-DLP service components (config generator, envoy templates) to the worker node."
- name: Sync YT-DLP service files to worker
synchronize:
src: "../{{ item }}"
dest: "{{ airflow_worker_dir }}/"
archive: yes
recursive: yes
rsync_path: "sudo rsync"
rsync_opts: "{{ rsync_default_opts }}"
loop:
- "airflow/camoufox"
- name: Sync YT-DLP config generator to worker - name: Sync YT-DLP config generator to worker
synchronize: synchronize:
@ -99,7 +88,6 @@
- "docker-compose-ytdlp-ops.yaml.j2" - "docker-compose-ytdlp-ops.yaml.j2"
- "docker-compose.config-generate.yaml" - "docker-compose.config-generate.yaml"
- "envoy.yaml.j2" - "envoy.yaml.j2"
- "docker-compose.camoufox.yaml.j2"
- name: Sync Airflow build context to worker - name: Sync Airflow build context to worker
synchronize: synchronize:
@ -209,19 +197,35 @@
force_source: true force_source: true
when: not fast_deploy | default(false) when: not fast_deploy | default(false)
- name: "Log: Building Camoufox (remote browser) image" - name: "Log: Building aria2-pro image"
debug: debug:
msg: "Building the Camoufox image locally. This image provides remote-controlled Firefox browsers for token generation." msg: "Building the aria2-pro image locally. This image provides the download manager."
- name: Build Camoufox image from local Dockerfile
community.docker.docker_image:
name: "camoufox:latest"
build:
path: "{{ airflow_worker_dir }}/camoufox"
source: build
force_source: true
when: not fast_deploy | default(false) when: not fast_deploy | default(false)
- name: Build aria2-pro image from docker-compose
ansible.builtin.command: >
docker compose -f configs/docker-compose.airflow.yml build aria2-pro
args:
chdir: "{{ airflow_worker_dir }}"
become: yes
become_user: "{{ ansible_user }}"
register: docker_build_result
changed_when: "'Building' in docker_build_result.stdout or 'writing image' in docker_build_result.stdout"
when: not fast_deploy | default(false)
# - name: "Log: Building Camoufox (remote browser) image"
# debug:
# msg: "Building the Camoufox image locally. This image provides remote-controlled Firefox browsers for token generation."
#
# - name: Build Camoufox image from local Dockerfile
# community.docker.docker_image:
# name: "camoufox:latest"
# build:
# path: "{{ airflow_worker_dir }}/camoufox"
# source: build
# force_source: true
# when: not fast_deploy | default(false)
- name: Ensure correct permissions for build context after generation - name: Ensure correct permissions for build context after generation
file: file:
path: "{{ airflow_worker_dir }}" path: "{{ airflow_worker_dir }}"
@ -245,7 +249,6 @@
project_src: "{{ airflow_worker_dir }}" project_src: "{{ airflow_worker_dir }}"
files: files:
- "configs/docker-compose-ytdlp-ops.yaml" - "configs/docker-compose-ytdlp-ops.yaml"
- "configs/docker-compose.camoufox.yaml"
- "configs/docker-compose.airflow.yml" - "configs/docker-compose.airflow.yml"
state: absent state: absent
remove_volumes: true # Corresponds to docker compose down -v remove_volumes: true # Corresponds to docker compose down -v
@ -259,20 +262,19 @@
- name: "Log: Starting all worker services" - name: "Log: Starting all worker services"
debug: debug:
msg: "Starting all worker services: ytdlp-ops, camoufox, and airflow-worker." msg: "Starting all worker services: ytdlp-ops, and airflow-worker."
- name: Start all worker services - name: Start all worker services
community.docker.docker_compose_v2: community.docker.docker_compose_v2:
project_src: "{{ airflow_worker_dir }}" project_src: "{{ airflow_worker_dir }}"
files: files:
- "configs/docker-compose-ytdlp-ops.yaml" - "configs/docker-compose-ytdlp-ops.yaml"
- "configs/docker-compose.camoufox.yaml"
- "configs/docker-compose.airflow.yml" - "configs/docker-compose.airflow.yml"
state: present state: present
remove_orphans: true remove_orphans: true
pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}" pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"
recreate: always # Corresponds to --force-recreate recreate: always # Corresponds to --force-recreate
- name: Include camoufox verification tasks # - name: Include camoufox verification tasks
include_tasks: ../../../tasks/verify_camoufox.yml # include_tasks: ../../../tasks/verify_camoufox.yml
when: not fast_deploy | default(false) # when: not fast_deploy | default(false)

View File

@ -27,19 +27,55 @@ execution_control:
info_json_generation_policy: info_json_generation_policy:
# Use a standard client. The server will handle token generation. # Use a standard client. The server will handle token generation.
client: web client: tv_simply
--- ---
# Policy: Test download specific DASH formats from a folder of info.jsons. # Policy: Full-stack test with visitor ID rotation and test download.
# This policy uses a single worker to test-download a list of video-only DASH # This policy uses a single worker to fetch info.json files for a list of URLs,
# formats from a directory of existing info.json files. It only downloads the # and then immediately performs a test download (first 10KB) of specified formats.
# first 10KB of each format and sleeps between each file. # It simulates user churn by creating a new profile (and thus a new visitor_id and POT)
name: download_dashy_formats_test # every 250 requests. A short sleep is used between requests.
name: full_stack_with_visitor_id_rotation
settings:
mode: full_stack
urls_file: "urls.txt" # Placeholder, should be overridden with --set
info_json_script: "bin/ytops-client get-info"
# Use the modern profile management system to rotate visitor_id.
profile_mode: per_worker_with_rotation
profile_management:
prefix: "visitor_rotator"
# Rotate to a new profile generation after 250 requests.
max_requests_per_profile: 250
execution_control:
run_until: { cycles: 1 } # Run through the URL list once.
workers: 1 # Run with a single worker thread.
# A short, fixed sleep between each info.json request.
sleep_between_tasks: { min_seconds: 0.75, max_seconds: 0.75 }
info_json_generation_policy:
# Use a standard client. The server will handle token generation.
client: tv_simply
download_policy:
formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy"
downloader: "native-py"
extra_args: '--test --cleanup'
output_dir: "downloads/fetch_and_test"
sleep_between_formats: { min_seconds: 6, max_seconds: 6 }
---
# Policy: Download-only test from a fetch folder (Batch Mode).
# This policy scans a directory of existing info.json files once, and performs
# a test download (first 10KB) for specific formats. It is designed to run as
# a batch job after a 'fetch_only' policy has completed.
name: download_only_test_from_fetch_folder
settings: settings:
mode: download_only mode: download_only
# Directory of info.json files to process. # Directory of info.json files to process.
info_json_dir: "fetched_info_jsons/visitor_id_rotation" # Assumes output from the above policy info_json_dir: "fetched_info_jsons/visitor_id_rotation" # Assumes output from 'fetch_with_visitor_id_rotation'
execution_control: execution_control:
run_until: { cycles: 1 } # Run through the info.json directory once. run_until: { cycles: 1 } # Run through the info.json directory once.
@ -49,10 +85,42 @@ execution_control:
download_policy: download_policy:
# A specific list of video-only DASH formats to test. # A specific list of video-only DASH formats to test.
# The "-dashy" suffix is illustrative; the format IDs must exist in the info.json.
formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy" formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy"
# Use the native Python downloader for better performance and control.
downloader: "native-py" downloader: "native-py"
# Pass extra arguments to yt-dlp to perform a "test" download (first 10KB). # Pass extra arguments to perform a "test" download.
extra_args: '--download-sections "*0-10240"' extra_args: '--test --cleanup'
output_dir: "downloads/dash_test" output_dir: "downloads/dash_test"
---
# Policy: Live download from a watch folder (Continuous Mode).
# This policy continuously watches a directory for new info.json files and
# processes them as they appear. It is designed to work as the second stage
# of a pipeline, consuming files generated by a 'fetch_only' policy.
name: live_download_from_watch_folder
settings:
mode: download_only
info_json_dir: "live_info_json" # A different directory for the live pipeline
directory_scan_mode: continuous
mark_processed_files: true # Rename files to *.processed to avoid re-downloading.
max_files_per_cycle: 50 # Process up to 50 new files each time it checks.
sleep_if_no_new_files_seconds: 15
execution_control:
# For 'continuous' mode, a time-based run_until is typical.
# {cycles: 1} will scan once, process new files, and exit.
# To run for 2 hours, for example, use: run_until: { minutes: 120 }
run_until: { cycles: 1 }
workers: 4 # Use a few workers to process files in parallel.
# sleep_between_tasks controls the pause between processing different info.json files.
# To pause before each download attempt starts, use 'pause_before_download_seconds'
# in the download_policy section below.
sleep_between_tasks: { min_seconds: 0, max_seconds: 0 }
download_policy:
formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy"
downloader: "native-py"
# Example: Pause for a few seconds before starting each download attempt.
# pause_before_download_seconds: 2
extra_args: '--test --cleanup'
output_dir: "downloads/live_dash_test"
View File
@ -0,0 +1,84 @@
# This file contains policies for testing ban rates and profile survival
# under high request counts.
---
# Policy: Single Profile Ban Test (500 Requests)
# This policy uses a single worker and a single, non-rotating profile to make
# 500 consecutive info.json requests. It is designed to test if and when a
# single profile/visitor_id gets banned or rate-limited by YouTube.
#
# It explicitly disables the server's automatic visitor ID rotation to ensure
# the same identity is used for all requests.
#
# The test will stop if it encounters 3 errors within any 1-minute window,
# or a total of 8 HTTP 403 errors within any 60-minute window.
name: single_profile_ban_test_500
settings:
mode: fetch_only
urls_file: "urls.txt" # Override with --set settings.urls_file=...
info_json_script: "bin/ytops-client get-info"
save_info_json_dir: "fetched_info_jsons/ban_test_single_profile"
# Use one worker with one profile that does not rotate automatically.
profile_mode: per_worker_with_rotation
profile_management:
prefix: "ban_test_user"
# Set a high request limit to prevent the orchestrator from rotating the profile.
max_requests_per_profile: 1000
execution_control:
run_until: { requests: 500 } # Stop after 500 total requests.
workers: 1
sleep_between_tasks: { min_seconds: 1, max_seconds: 2 }
info_json_generation_policy:
client: "tv_simply" # A typical client for this kind of test.
# Explicitly disable the server's visitor ID rotation mechanism.
request_params:
session_params:
visitor_rotation_threshold: 0
stop_conditions:
# Stop if we get 3 or more errors in any 1-minute window (rapid failure).
on_error_rate: { max_errors: 3, per_minutes: 1 }
# Stop if we get 8 or more 403 errors in any 60-minute window (ban detection).
on_cumulative_403: { max_errors: 8, per_minutes: 60 }
---
# Policy: Multi-Profile Survival Test
# This policy uses 5 parallel workers, each with its own unique profile.
# It tests whether using multiple profiles with the server's default automatic
# visitor ID rotation (every 250 requests) can sustain a high request rate
# without getting banned.
#
# The test will run until 1250 total requests have been made (250 per worker),
# which should trigger one rotation for each profile.
name: multi_profile_survival_test
settings:
mode: fetch_only
urls_file: "urls.txt" # Override with --set settings.urls_file=...
info_json_script: "bin/ytops-client get-info"
save_info_json_dir: "fetched_info_jsons/ban_test_multi_profile"
# Use 5 workers, each getting its own rotating profile.
profile_mode: per_worker_with_rotation
profile_management:
prefix: "survival_test_user"
# Use the default rotation threshold of 250 requests per profile.
max_requests_per_profile: 250
execution_control:
run_until: { requests: 1250 } # 5 workers * 250 requests/rotation = 1250 total.
workers: 5
sleep_between_tasks: { min_seconds: 1, max_seconds: 2 }
info_json_generation_policy:
client: "tv_simply"
# No request_params are needed here; we want to use the server's default
# visitor ID rotation behavior.
stop_conditions:
# Stop if we get 3 or more errors in any 1-minute window (rapid failure).
on_error_rate: { max_errors: 3, per_minutes: 1 }
# Stop if we get 8 or more 403 errors in any 60-minute window (ban detection).
on_cumulative_403: { max_errors: 8, per_minutes: 60 }
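Both stop conditions are sliding-window counters: trip once N errors land inside any M-minute window. The sketch below only illustrates that window logic; it makes no assumption about the orchestrator's real implementation.

import time
from collections import deque

class SlidingWindowStop:
    """Trips when max_errors (or more) are recorded within per_minutes."""
    def __init__(self, max_errors, per_minutes):
        self.max_errors = max_errors
        self.window = per_minutes * 60
        self.events = deque()

    def record(self, now=None):
        now = now or time.time()
        self.events.append(now)
        while self.events and now - self.events[0] > self.window:
            self.events.popleft()  # drop errors that fell out of the window
        return len(self.events) >= self.max_errors  # True means: stop the run

# The single-profile ban test above: 3 errors per 1 minute, 8 (403s) per 60 minutes.
rapid_failures = SlidingWindowStop(3, 1)
ban_detection = SlidingWindowStop(8, 60)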
View File
@ -27,7 +27,10 @@ def main():
    if last_arg.startswith('-') and len(last_arg) == 11:
        import re
        if re.fullmatch(r'-[a-zA-Z0-9_-]{10}', last_arg):
            # Only insert '--' if it's not already the preceding argument.
            # This prevents `stress_policy_tool` which already adds '--' from causing an error.
            if sys.argv[-2] != '--':
                sys.argv.insert(len(sys.argv) - 1, '--')

    parser = argparse.ArgumentParser(
        description="YT Ops Client Tools",
View File
@ -12,14 +12,16 @@ import glob
import shutil
import re
import shlex
import threading
import time
from urllib.parse import urljoin

try:
    import aria2p
    from aria2p.utils import human_readable_bytes
    import yt_dlp
except ImportError:
    print("aria2p or yt-dlp is not installed. Please install them with: pip install aria2p yt-dlp", file=sys.stderr)
    sys.exit(1)

logger = logging.getLogger('download_aria_tool')
@ -61,15 +63,18 @@ cat latest-info.json | yt-ops-client download aria-rpc -f "299/137" \\
    parser.add_argument('--aria-host', default='localhost', help='The host of the aria2c RPC server. Default: localhost.')
    parser.add_argument('--aria-port', type=int, default=6800, help='The port of the aria2c RPC server. Default: 6800.')
    parser.add_argument('--aria-secret', help='The secret token for the aria2c RPC server (often required, e.g., "SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX").')
    parser.add_argument('--proxy', help='Proxy to use for the download, e.g., "socks5://127.0.0.1:1080". This sets the "all-proxy" option in aria2c.')
    parser.add_argument('--downloader-args', help='Arguments for aria2c, in yt-dlp format (e.g., "aria2c:[-x 8, -k 1M]").')
    parser.add_argument('--wait', action='store_true', help='Wait for the download to complete and report its status. Note: This makes the operation synchronous and will block until the download finishes.')
    parser.add_argument('--wait-timeout', help='Timeout in seconds for waiting on downloads. Use "auto" to calculate based on a minimum speed of 200KiB/s. Requires --wait. Default: no timeout.')
    parser.add_argument('--max-concurrent-fragments', type=int, default=8, help='Maximum number of fragments to download concurrently when using --wait. Mimics aria2c\'s -j option. Default: 8.')
    parser.add_argument('--auto-merge-fragments', action='store_true', help='Automatically merge fragments after download. Requires --wait and assumes the script has filesystem access to the aria2c host.')
    parser.add_argument('--remove-fragments-after-merge', action='store_true', help='Delete individual fragment files after a successful merge. Requires --auto-merge-fragments.')
    parser.add_argument('--cleanup', action='store_true', help='After a successful download, remove the final file(s) from the filesystem. For fragmented downloads, this implies --remove-fragments-after-merge.')
    parser.add_argument('--remove-on-complete', action=argparse.BooleanOptionalAction, default=True, help='Remove the download from aria2c history on successful completion. Use --no-remove-on-complete to disable. May fail on older aria2c daemons.')
    parser.add_argument('--purge-on-complete', action='store_true', help='Use aria2.purgeDownloadResult to clear ALL completed/failed downloads from history on success. Use as a workaround for older daemons.')
    parser.add_argument('--add-header', action='append', help='Add a custom HTTP header for the download. Format: "Key: Value". Can be used multiple times.')
    parser.add_argument('--user-agent', help='Specify a custom User-Agent. Overrides any User-Agent from info.json, --add-header, or the default.')
    parser.add_argument('--verbose', action='store_true', help='Enable verbose output for this script.')
    return parser
@ -101,6 +106,10 @@ def parse_aria_error(download):
    if not error_message:
        return f"Unknown aria2c error (Code: {error_code})"

    # Handle specific error codes that provide more context
    if error_code == 24:  # Authorization failed
        return f"HTTP Authorization Failed (Error 24). The URL may have expired or requires valid cookies/headers. Raw message: {error_message}"

    # Check for common HTTP errors in the message
    http_status_match = re.search(r'HTTP status (\d+)', error_message)
    if http_status_match:
@ -144,6 +153,8 @@ def parse_aria_args_to_options(args_str):
    parser.add_argument('-x', '--max-connection-per-server')
    parser.add_argument('-k', '--min-split-size')
    parser.add_argument('-s', '--split')
    parser.add_argument('--http-proxy')
    parser.add_argument('--https-proxy')
    parser.add_argument('--all-proxy')

    try:
@ -151,8 +162,10 @@ def parse_aria_args_to_options(args_str):
        known_args, unknown_args = parser.parse_known_args(arg_list)
        if unknown_args:
            logger.warning(f"Ignoring unknown arguments in --downloader-args: {unknown_args}")
        # Convert to dict, removing None values, and converting underscores back to hyphens
        # to match the option format expected by aria2c's RPC interface.
        return {k.replace('_', '-'): v for k, v in vars(known_args).items() if v is not None}
    except Exception:
        logger.warning(f"Failed to parse arguments inside --downloader-args: '{inner_args_str}'")
        return {}
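The underscore-to-hyphen conversion is the substantive part of this change: argparse stores '--max-connection-per-server' as max_connection_per_server, while aria2c's RPC interface only understands the hyphenated form. A standalone sketch of the same normalization, using a reduced set of the options registered above:

import argparse
import shlex

def parse_aria2c_options(inner_args_str):
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('-x', '--max-connection-per-server')
    parser.add_argument('-k', '--min-split-size')
    parser.add_argument('--all-proxy')
    known, _unknown = parser.parse_known_args(shlex.split(inner_args_str))
    # argparse namespaces use underscores; aria2c RPC option names use hyphens.
    return {k.replace('_', '-'): v for k, v in vars(known).items() if v is not None}

print(parse_aria2c_options("-x 8 -k 1M"))
# {'max-connection-per-server': '8', 'min-split-size': '1M'}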
@ -161,6 +174,9 @@ def parse_aria_args_to_options(args_str):
def main_download_aria(args):
    """Main logic for the 'download-aria' command."""
    log_level = logging.DEBUG if args.verbose else logging.INFO
    # Reconfigure root logger to ensure our settings are applied.
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(level=log_level, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', stream=sys.stderr)

    if args.remove_fragments_after_merge and not args.auto_merge_fragments:
@ -198,25 +214,43 @@ def main_download_aria(args):
logger.error(f"Failed to parse info.json from {input_source_name}. Is the input valid JSON?") logger.error(f"Failed to parse info.json from {input_source_name}. Is the input valid JSON?")
return 1 return 1
# Find the requested format, supporting yt-dlp style selectors # Find the requested format using yt-dlp's own selection logic
target_format = None try:
# A format selector can be a comma-separated list of preferences, # We don't need a full ydl instance, just the format selection logic.
# where each preference can be a slash-separated list of format_ids. ydl = yt_dlp.YoutubeDL({'quiet': True, 'logger': logger, 'format': args.format})
# e.g., "299/137/136,140" means try 299, then 137, then 136, then 140. formats = info_data.get('formats', [])
format_preferences = [item.strip() for sublist in (i.split('/') for i in args.format.split(',')) for item in sublist if item.strip()] selector = ydl.build_format_selector(args.format)
ctx = {
'formats': formats,
'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats)
or all(f.get('acodec') == 'none' for f in formats)),
}
selected_formats = list(selector(ctx))
except Exception as e:
logger.error(f"Failed to select format with selector '{args.format}': {e}", exc_info=args.verbose)
return 1
available_formats_map = {f['format_id']: f for f in info_data.get('formats', []) if 'format_id' in f} if not selected_formats:
for format_id in format_preferences:
if format_id in available_formats_map:
target_format = available_formats_map[format_id]
logger.info(f"Selected format ID '{format_id}' from selector '{args.format}'.")
break
if not target_format:
logger.error(f"No suitable format found for selector '{args.format}' in info.json.") logger.error(f"No suitable format found for selector '{args.format}' in info.json.")
return 1 return 1
# The selector might return multiple results if ',' is used. We'll process the first one.
target_format = selected_formats[0]
if len(selected_formats) > 1:
logger.warning(f"Format selector '{args.format}' resolved to multiple format combinations. Only the first one will be downloaded.")
formats_to_download = target_format.get('requested_formats', [target_format])
if len(formats_to_download) > 1:
logger.warning(
f"The selected format is a combination of {len(formats_to_download)} streams. "
f"This tool does not support merging separate video/audio streams. "
f"Only the first stream (format_id: {formats_to_download[0].get('format_id')}) will be downloaded. "
f"To download all streams, please specify their format IDs separately."
)
target_format = formats_to_download[0]
# Get file size for auto-timeout and dynamic options # Get file size for auto-timeout and dynamic options
total_filesize = target_format.get('filesize') or target_format.get('filesize_approx') total_filesize = target_format.get('filesize') or target_format.get('filesize_approx')
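As a quick standalone check of the selection path above, the same selector can be exercised against a synthetic formats list. The format entries below are invented for the example, and the ctx keys simply mirror the call in this diff; they are an internal yt-dlp detail and may change between releases.

import yt_dlp

formats = [
    {'format_id': '137', 'ext': 'mp4', 'vcodec': 'avc1', 'acodec': 'none', 'url': 'https://example.com/137'},
    {'format_id': '140', 'ext': 'm4a', 'vcodec': 'none', 'acodec': 'mp4a.40.2', 'url': 'https://example.com/140'},
]
ydl = yt_dlp.YoutubeDL({'quiet': True})
selector = ydl.build_format_selector('137/140')
ctx = {
    'formats': formats,
    'has_merged_format': False,   # no entry carries both audio and video
    'incomplete_formats': False,  # both stream kinds are present in the list
}
print([f['format_id'] for f in selector(ctx)])  # expected: ['137']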
@ -231,9 +265,9 @@ def main_download_aria(args):
    # Prepare options for aria2
    aria_options = {
        # Options from yt-dlp's aria2c integration for performance and reliability
        'max-connection-per-server': 16,
        'split': 16,
        'http-accept-gzip': 'true',
        'file-allocation': 'none',
    }
@ -243,20 +277,59 @@ def main_download_aria(args):
    custom_options = parse_aria_args_to_options(args.downloader_args)

    # Set min-split-size. yt-dlp's default is 1M.
    if 'min-split-size' not in custom_options:
        if total_filesize and total_filesize > 100 * 1024 * 1024:  # 100 MiB
            aria_options['min-split-size'] = '5M'
            logger.info("File is > 100MiB, dynamically setting min-split-size to 5M.")
        else:
            aria_options['min-split-size'] = '1M'

    if custom_options:
        aria_options.update(custom_options)
        logger.info(f"Applied custom aria2c options from --downloader-args: {custom_options}")

    # For older aria2c versions, SOCKS5 proxy must be specified with an 'http://' scheme.
    if 'all-proxy' in aria_options and isinstance(aria_options['all-proxy'], str) and aria_options['all-proxy'].startswith('socks5://'):
        proxy_url = aria_options['all-proxy']
        logger.info("Replacing 'socks5://' with 'http://' in proxy URL for aria2c compatibility.")
        aria_options['all-proxy'] = 'http://' + proxy_url[len('socks5://'):]

    aria_options['out'] = filename

    # Add headers from info.json, and allow overriding/adding with --add-header
    headers = target_format.get('http_headers', {}).copy()
    if args.add_header:
        for header in args.add_header:
            if ':' not in header:
                logger.error(f"Invalid header format in --add-header: '{header}'. Expected 'Key: Value'.")
                return 1
            key, value = header.split(':', 1)
            key = key.strip()
            value = value.strip()
            if key in headers:
                logger.info(f"Overwriting header '{key}' from info.json with value from command line.")
            else:
                logger.info(f"Adding header from command line: {key}: {value}")
            headers[key] = value

    # Enforce a consistent User-Agent.
    # First, remove any User-Agent that might have come from info.json, case-insensitively.
    for key in list(headers.keys()):
        if key.lower() == 'user-agent':
            del headers[key]

    # Set the default Cobalt User-Agent.
    default_user_agent = 'Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version'
    headers['User-Agent'] = default_user_agent
    logger.info(f"Set default User-Agent to: {default_user_agent}")

    # The --user-agent flag has the highest precedence and can override the default.
    if args.user_agent:
        headers['User-Agent'] = args.user_agent
        logger.info(f"Overriding User-Agent with value from --user-agent: {args.user_agent}")

    if headers:
        header_list = [f'{key}: {value}' for key, value in headers.items()]
        aria_options['header'] = header_list
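The resulting precedence is: headers from info.json form the base, --add-header entries override them, the default Cobalt User-Agent replaces any inherited one, and --user-agent wins over everything. A minimal sketch of that merge order; the function name is illustrative only:

def merge_headers(info_json_headers, add_headers=(), user_agent=None,
                  default_user_agent='Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version'):
    headers = dict(info_json_headers or {})
    for raw in add_headers:                      # "Key: Value" strings from --add-header
        key, value = raw.split(':', 1)
        headers[key.strip()] = value.strip()
    for key in [k for k in headers if k.lower() == 'user-agent']:
        del headers[key]                         # drop any inherited User-Agent, case-insensitively
    headers['User-Agent'] = user_agent or default_user_agent
    return headers

print(merge_headers({'user-agent': 'from-info-json', 'Accept': '*/*'}, ['X-Debug: 1'])['User-Agent'])
# -> Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version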
@ -268,6 +341,12 @@ def main_download_aria(args):
            else:
                logger.debug(f"  Header: {h}")

    # Final check: ensure all option values are strings, as required by aria2c RPC.
    # The 'header' option is a list of strings, which is a special case and should be preserved.
    for key, value in aria_options.items():
        if key != 'header' and not isinstance(value, str):
            aria_options[key] = str(value)

    is_fragmented = 'fragments' in target_format
    if not is_fragmented:
        url = target_format.get('url')
@ -305,10 +384,20 @@ def main_download_aria(args):
logger.error(f"Invalid --wait-timeout value: '{args.wait_timeout}'. Must be a positive integer or 'auto'.") logger.error(f"Invalid --wait-timeout value: '{args.wait_timeout}'. Must be a positive integer or 'auto'.")
return 1 return 1
# Determine the download directory for aria2c.
# If --remote-dir is specified, it takes precedence.
# Otherwise, assume a local setup and use --output-dir.
# It's crucial to use an absolute path to avoid ambiguity for the aria2c daemon.
download_dir_for_aria = args.remote_dir
if not download_dir_for_aria:
local_dir = args.output_dir or '.'
download_dir_for_aria = os.path.abspath(local_dir)
logger.info(f"No --remote-dir specified. Using local path for aria2c download directory: {download_dir_for_aria}")
if is_fragmented: if is_fragmented:
return download_fragments_aria(args, api, target_format, filename, aria_options, timeout_seconds, remote_dir=args.remote_dir) return download_fragments_aria(args, api, target_format, filename, aria_options, timeout_seconds, remote_dir=download_dir_for_aria)
else: else:
return download_url_aria(args, api, url, filename, aria_options, timeout_seconds, remote_dir=args.remote_dir) return download_url_aria(args, api, url, filename, aria_options, timeout_seconds, remote_dir=download_dir_for_aria)
except Exception as e: except Exception as e:
logger.error(f"An error occurred while communicating with aria2c: {e}", exc_info=args.verbose) logger.error(f"An error occurred while communicating with aria2c: {e}", exc_info=args.verbose)
@ -325,87 +414,98 @@ def download_url_aria(args, api, url, filename, aria_options, timeout_seconds, r
logger.error("Failed to add download to aria2c. The API returned an empty result.") logger.error("Failed to add download to aria2c. The API returned an empty result.")
return 1 return 1
# Handle older aria2p versions that return a single Download object instead of a list
download = downloads[0] if isinstance(downloads, list) else downloads download = downloads[0] if isinstance(downloads, list) else downloads
logger.info(f"Successfully added download to aria2c. GID: {download.gid}") logger.info(f"Successfully added download to aria2c. GID: {download.gid}")
if args.wait: if args.wait:
logger.info(f"Waiting for download {download.gid} to complete...") logger.info(f"Waiting for download {download.gid} to complete using WebSocket events...")
start_time = time.time() download_finished_event = threading.Event()
final_status = {}
def on_complete(api_ref, event_gid):
if event_gid == download.gid:
logger.debug(f"WebSocket: GID {event_gid} completed.")
final_status['status'] = 'complete'
download_finished_event.set()
def on_error(api_ref, event_gid):
if event_gid == download.gid:
logger.debug(f"WebSocket: GID {event_gid} errored.")
final_status['status'] = 'error'
download_finished_event.set()
def on_stop(api_ref, event_gid):
if event_gid == download.gid:
logger.debug(f"WebSocket: GID {event_gid} stopped.")
final_status['status'] = 'stopped'
download_finished_event.set()
listener_thread = threading.Thread(
target=api.listen_to_notifications,
kwargs={
'on_download_complete': on_complete,
'on_download_error': on_error,
'on_download_stop': on_stop,
'timeout': 1,
'handle_signals': False
},
daemon=True
)
try: try:
while True: listener_thread.start()
if timeout_seconds and (time.time() - start_time > timeout_seconds): finished = download_finished_event.wait(timeout=timeout_seconds)
raise TimeoutError(f"Download did not complete within {timeout_seconds}s timeout.") if not finished and not download_finished_event.is_set():
raise TimeoutError(f"Download did not complete within {timeout_seconds}s timeout.")
# Re-fetch the download object to get the latest status except KeyboardInterrupt:
download.update()
# A download is no longer active if it's complete, errored, paused, or removed.
if download.status not in ('active', 'waiting'):
break
progress_info = (
f"\rGID {download.gid}: {download.status} "
f"{download.progress_string()} "
f"({download.download_speed_string()}) "
f"ETA: {download.eta_string()}"
)
sys.stdout.write(progress_info)
sys.stdout.flush()
time.sleep(0.5)
except (KeyboardInterrupt, TimeoutError) as e:
sys.stdout.write('\n') sys.stdout.write('\n')
if isinstance(e, KeyboardInterrupt): logger.warning("Wait interrupted by user. Cleaning up download...")
logger.warning("Wait interrupted by user. Cleaning up download...") cleanup_aria_download(api, [download])
cleanup_aria_download(api, [download]) return 130
return 130 except TimeoutError as e:
else: # TimeoutError logger.error(f"Download timed out. Cleaning up... Error: {e}")
logger.error(f"Download timed out. Cleaning up... Error: {e}") cleanup_aria_download(api, [download])
cleanup_aria_download(api, [download]) return 1
return 1 finally:
api.stop_listening()
if listener_thread.is_alive():
listener_thread.join(timeout=2)
# Re-fetch download object to get final details
try:
download.update()
except aria2p.ClientException as e: except aria2p.ClientException as e:
# This can happen if the download completes and is removed by aria2c logger.warning(f"Could not update final status for GID {download.gid} (maybe removed on completion?): {e}.")
# before we can check its final status. Assume success in this case. if final_status.get('status') != 'complete':
logger.warning(f"Could not get final status for GID {download.gid} (maybe removed on completion?): {e}. Assuming success.") logger.error(f"Download {download.gid} failed, but could not retrieve final error details.")
print(f"Download for GID {download.gid} presumed successful.") return 1
return 0
sys.stdout.write('\n') # Newline after progress bar if final_status.get('status') == 'complete':
# Final status check (no need to update again, we have the latest status)
if download.status == 'complete':
logger.info(f"Download {download.gid} completed successfully.") logger.info(f"Download {download.gid} completed successfully.")
downloaded_filepath_remote = download.files[0].path if download.files else None
downloaded_filepath_remote = None if downloaded_filepath_remote:
if download.files:
downloaded_filepath_remote = download.files[0].path
print(f"Download successful: {downloaded_filepath_remote}") print(f"Download successful: {downloaded_filepath_remote}")
else: else:
print("Download successful, but no file path reported by aria2c.") print("Download successful, but no file path reported by aria2c.")
if args.cleanup and downloaded_filepath_remote: if args.cleanup and downloaded_filepath_remote:
local_filepath = None
# To map remote path to local, we need remote_dir and a local equivalent.
# We'll use fragments_dir as the local equivalent, which defaults to output_dir.
local_base_dir = args.fragments_dir or args.output_dir or '.' local_base_dir = args.fragments_dir or args.output_dir or '.'
if remote_dir: if remote_dir and downloaded_filepath_remote.startswith(remote_dir):
if downloaded_filepath_remote.startswith(remote_dir): relative_path = os.path.relpath(downloaded_filepath_remote, remote_dir)
relative_path = os.path.relpath(downloaded_filepath_remote, remote_dir) local_filepath = os.path.join(local_base_dir, relative_path)
local_filepath = os.path.join(local_base_dir, relative_path)
else:
logger.warning(f"Cleanup: Downloaded file path '{downloaded_filepath_remote}' does not start with remote-dir '{remote_dir}'. Cannot map to local path.")
else: else:
logger.warning(f"Cleanup: --remote-dir not specified. Assuming download path is accessible locally as '{downloaded_filepath_remote}'.")
local_filepath = downloaded_filepath_remote local_filepath = downloaded_filepath_remote
if not remote_dir:
logger.warning(f"Cleanup: --remote-dir not specified. Assuming download path is accessible locally as '{local_filepath}'.")
if local_filepath: try:
try: if os.path.exists(local_filepath):
if os.path.exists(local_filepath): os.remove(local_filepath)
os.remove(local_filepath) logger.info(f"Cleanup: Removed downloaded file '{local_filepath}'")
logger.info(f"Cleanup: Removed downloaded file '{local_filepath}'") else:
else: logger.warning(f"Cleanup: File not found at expected local path '{local_filepath}'. Skipping removal.")
logger.warning(f"Cleanup: File not found at expected local path '{local_filepath}'. Skipping removal.") except OSError as e:
except OSError as e: logger.error(f"Cleanup failed: Could not remove file '{local_filepath}': {e}")
logger.error(f"Cleanup failed: Could not remove file '{local_filepath}': {e}")
elif args.cleanup: elif args.cleanup:
logger.warning("Cleanup requested, but no downloaded file path was reported by aria2c.") logger.warning("Cleanup requested, but no downloaded file path was reported by aria2c.")
@ -417,11 +517,10 @@ def download_url_aria(args, api, url, filename, aria_options, timeout_seconds, r
logger.warning(f"Failed to purge download history: {e}") logger.warning(f"Failed to purge download history: {e}")
elif args.remove_on_complete: elif args.remove_on_complete:
try: try:
api.remove_download_result(download) api.client.remove_download_result(download.gid)
logger.info(f"Removed download {download.gid} from aria2c history.") logger.info(f"Removed download {download.gid} from aria2c history.")
except Exception as e: except Exception as e:
logger.warning(f"Failed to remove download {download.gid} from history: {e}") logger.warning(f"Failed to remove download {download.gid} from history: {e}")
return 0 return 0
else: else:
detailed_error = parse_aria_error(download) detailed_error = parse_aria_error(download)
@ -445,243 +544,236 @@ def download_fragments_aria(args, api, target_format, filename, aria_options, ti
        )
        return 1

    frag_aria_options = aria_options.copy()
    frag_aria_options.pop('out', None)
    if remote_dir:
        frag_aria_options['dir'] = remote_dir
        logger.info(f"Instructing remote aria2c to save fragments to: {remote_dir}")

    base_filename, file_ext = os.path.splitext(filename)

    multicall_payload = []
    for i, fragment in enumerate(fragments):
        frag_url = fragment.get('url') or urljoin(fragment_base_url, fragment['path'])
        if not frag_url:
            logger.error(f"Fragment {i} has no URL and no fragment_base_url is available. Aborting.")
            return 1

        fragment_filename = f"{base_filename}-Frag{i}{file_ext}"
        current_frag_options = frag_aria_options.copy()
        current_frag_options['out'] = os.path.basename(fragment_filename)

        # The aria2p library will handle adding the secret token to each call in the multicall.
        params = [[frag_url], current_frag_options]
        multicall_payload.append({'methodName': 'aria2.addUri', 'params': params})
    if not args.wait:
        # Asynchronous mode: submit all fragments at once and exit.
        gids, failed_count = [], 0
        try:
            logger.info(f"Submitting {len(multicall_payload)} fragments to aria2c in a single batch request...")
            # The aria2p client library correctly handles authentication for multicalls.
            results = api.client.multicall(multicall_payload)
            for i, result in enumerate(results):
                if isinstance(result, list) and len(result) == 1 and isinstance(result[0], str):
                    gids.append(result[0])
                else:
                    failed_count += 1
                    logger.warning(f"Failed to add fragment {i + 1}: {result[0] if isinstance(result, list) else result}")
        except Exception as e:
            logger.error(f"Batch submission to aria2c failed: {e}", exc_info=args.verbose)
            return 1

        if failed_count > 0:
            logger.warning(f"{failed_count} out of {len(fragments)} fragments failed to be added to aria2c.")
        if not gids:
            logger.error("Failed to add any fragments to aria2c. All submissions failed.")
            return 1

        print(f"Successfully added {len(gids)} fragments. GIDs: {gids}\nThese fragments will need to be merged manually after download.")
        return 0
    # Synchronous (--wait) mode with WebSockets
    MAX_CONCURRENT_FRAGMENTS = args.max_concurrent_fragments
    all_gids, failed_submission_count = [], 0
    submitted_gids, completed_gids = set(), set()
    lock = threading.Lock()
    pending_fragments = list(enumerate(multicall_payload))
    total_fragment_count = len(pending_fragments)
    logger.info(f"Waiting for {total_fragment_count} fragments to complete using WebSocket events...")
    logger.info(f"Will maintain up to {MAX_CONCURRENT_FRAGMENTS} active fragment downloads.")

    def on_event(api_ref, event_gid):
        with lock:
            if event_gid in submitted_gids:
                completed_gids.add(event_gid)

    listener_thread = threading.Thread(
        target=api.listen_to_notifications,
        kwargs={'on_download_complete': on_event, 'on_download_error': on_event, 'on_download_stop': on_event, 'timeout': 1, 'handle_signals': False},
        daemon=True
    )
    listener_thread.start()
    start_time = time.time()

    try:
        while True:
            with lock:
                if len(completed_gids) >= total_fragment_count:
                    break
            if timeout_seconds and (time.time() - start_time > timeout_seconds):
                raise TimeoutError(f"Fragment downloads did not complete within {timeout_seconds}s timeout.")

            with lock:
                active_gids_count = len(submitted_gids) - len(completed_gids)
                num_to_submit = MAX_CONCURRENT_FRAGMENTS - active_gids_count
            if num_to_submit > 0 and pending_fragments:
                chunk_to_submit = pending_fragments[:num_to_submit]
                pending_fragments = pending_fragments[num_to_submit:]
                indices = [item[0] for item in chunk_to_submit]
                payloads = [item[1] for item in chunk_to_submit]
                try:
                    # The aria2p client library correctly handles authentication for multicalls.
                    results = api.client.multicall(payloads)
                    with lock:
                        for i, result in enumerate(results):
                            original_index = indices[i]
                            if isinstance(result, list) and len(result) == 1 and isinstance(result[0], str):
                                gid = result[0]
                                all_gids.append(gid)
                                submitted_gids.add(gid)
                            else:
                                failed_submission_count += 1
                                completed_gids.add(f"failed-submission-{original_index}")
                                logger.warning(f"Failed to add fragment {original_index + 1}: {result[0] if isinstance(result, list) else result}")
                except Exception as e:
                    logger.error(f"Batch submission to aria2c failed for a chunk: {e}", exc_info=args.verbose)
                    with lock:
                        for i in indices:
                            failed_submission_count += 1
                            completed_gids.add(f"failed-submission-{i}")

            with lock:
                completed_download_count = len(completed_gids)
            progress_percent = (completed_download_count / total_fragment_count * 100) if total_fragment_count > 0 else 0
            sys.stdout.write(f"\rProgress: {completed_download_count}/{total_fragment_count} fragments | {progress_percent:.1f}%")
            sys.stdout.flush()
            time.sleep(0.5)
    except (KeyboardInterrupt, TimeoutError) as e:
        sys.stdout.write('\n')
        if isinstance(e, KeyboardInterrupt):
            logger.warning("Wait interrupted by user. Cleaning up fragments...")
        else:
            logger.error(f"Download timed out. Cleaning up fragments... Error: {e}")
        cleanup_aria_download(api, api.get_downloads(list(submitted_gids)))
        return 130 if isinstance(e, KeyboardInterrupt) else 1
    finally:
        api.stop_listening()
        if listener_thread.is_alive():
            listener_thread.join(timeout=2)
    sys.stdout.write('\n')

    if failed_submission_count > 0:
        logger.error(f"{failed_submission_count} fragments failed to be submitted to aria2c.")

    final_downloads = []
    if all_gids:
        try:
            final_downloads = api.get_downloads(all_gids)
        except aria2p.ClientException as e:
            logger.warning(f"Could not perform final status check for fragments (maybe removed on completion?): {e}. Assuming success.")

    failed_downloads = [d for d in final_downloads if d.status != 'complete']
    if failed_downloads:
        logger.error(f"{len(failed_downloads)} fragments failed to download.")
        for d in failed_downloads[:5]:
            logger.error(f" GID {d.gid}: {parse_aria_error(d)}")
        if len(failed_downloads) > 5:
            logger.error(f" ... and {len(failed_downloads) - 5} more errors.")
        return 1

    if failed_submission_count > 0:
        logger.error("Aborting due to fragment submission failures.")
        return 1

    logger.info("All fragments downloaded successfully.")
    output_dir = args.output_dir or '.'
    final_filepath = os.path.join(output_dir, filename)
    fragments_lookup_dir = args.fragments_dir or output_dir

    if args.auto_merge_fragments:
        logger.info(f"Attempting to merge fragments into: {final_filepath}")
        logger.info(f"Searching for fragments in local directory: {os.path.abspath(fragments_lookup_dir)}")
        try:
            escaped_base = glob.escape(base_filename)
            search_path = os.path.join(fragments_lookup_dir, f"{escaped_base}-Frag*{file_ext}")
            fragment_files = sorted(glob.glob(search_path), key=lambda f: int(re.search(r'Frag(\d+)', os.path.basename(f)).group(1)))
            if not fragment_files:
                logger.error(f"No fragment files found with pattern: {search_path}")
                return 1

            with open(final_filepath, 'wb') as dest_file:
                for frag_path in fragment_files:
                    with open(frag_path, 'rb') as src_file:
                        shutil.copyfileobj(src_file, dest_file)
            logger.info(f"Successfully merged {len(fragment_files)} fragments into {final_filepath}")

            if args.remove_fragments_after_merge or args.cleanup:
                logger.info("Removing fragment files...")
                for frag_path in fragment_files: os.remove(frag_path)
                logger.info("Fragment files removed.")

            if args.cleanup:
                try:
                    os.remove(final_filepath)
                    logger.info(f"Cleanup: Removed merged file '{final_filepath}'")
                except OSError as e:
                    logger.error(f"Cleanup failed: Could not remove merged file '{final_filepath}': {e}")

            print(f"Download and merge successful: {final_filepath}")

            if args.purge_on_complete:
                try:
                    api.purge_download_result()
                    logger.info("Purged all completed/failed downloads from aria2c history.")
                except Exception as e:
                    logger.warning(f"Failed to purge download history: {e}")
            elif args.remove_on_complete:
                try:
                    for d in final_downloads:
                        try: api.client.remove_download_result(d.gid)
                        except aria2p.ClientException: pass
                    logger.info(f"Removed {len(final_downloads)} fragment downloads from aria2c history.")
                except Exception as e:
                    logger.warning(f"Failed to remove fragment downloads from history: {e}")

            return 0
        except Exception as e:
            logger.error(f"An error occurred during merging: {e}", exc_info=args.verbose)
            logger.error("Fragments were downloaded but not merged.")
            return 1
    else:
        print(f"Download successful. Fragments now need to be merged manually.\nThe final merged file should be named: {final_filepath}")
        print(f"You can merge them with a command like:\n cat `ls -v '{os.path.join(fragments_lookup_dir, base_filename)}'-Frag*'{file_ext}'` > '{final_filepath}'")

        if args.cleanup:
            logger.info("Cleanup requested. Removing downloaded fragments...")
            try:
                escaped_base = glob.escape(base_filename)
                search_path = os.path.join(fragments_lookup_dir, f"{escaped_base}-Frag*{file_ext}")
                fragment_files = glob.glob(search_path)
                if not fragment_files:
                    logger.warning(f"Cleanup: No fragment files found with pattern: {search_path}")
                else:
                    for frag_path in fragment_files: os.remove(frag_path)
                    logger.info(f"Removed {len(fragment_files)} fragment files.")
            except Exception as e:
                logger.error(f"An error occurred during fragment cleanup: {e}", exc_info=args.verbose)

        if args.purge_on_complete:
            try:
                api.purge_download_result()
                logger.info("Purged all completed/failed downloads from aria2c history.")
            except Exception as e:
                logger.warning(f"Failed to purge download history: {e}")
        elif args.remove_on_complete:
            try:
                api.remove_download_result(final_downloads)
                logger.info(f"Removed {len(final_downloads)} fragment downloads from aria2c history.")
            except Exception as e:
                logger.warning(f"Could not remove fragment downloads from history (maybe already gone?): {e}")

        return 0
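For reference, each aria2.addUri entry batched through system.multicall in this function is a {'methodName', 'params'} struct; when an RPC secret is configured, aria2 expects 'token:<secret>' as the first parameter, which aria2p prepends automatically per the comments above. A hand-rolled equivalent with placeholder values:

# What one batched aria2.addUri call looks like on the wire (illustrative values only).
secret = "SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX"
entry = {
    "methodName": "aria2.addUri",
    "params": [
        f"token:{secret}",                        # only present when an RPC secret is set
        ["https://example.com/frag0.mp4"],        # list of URIs for this download
        {"out": "video-Frag0.mp4", "dir": "/downloads"},
    ],
}
payload = [entry]  # system.multicall takes a list of such structs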
View File
@ -84,12 +84,19 @@ def add_download_native_py_parser(subparsers):
    parser.add_argument('--output-buffer', action='store_true', help='Download to an in-memory buffer and print raw bytes to stdout. Final filename is printed to stderr.')
    parser.add_argument('--cleanup', action='store_true', help='After download, rename the file to include a timestamp and truncate it to 0 bytes.')
    parser.add_argument('--merge-output-format', help='Container format to merge to (e.g., "mp4", "mkv"). Overrides config file.')
    parser.add_argument('--retries', type=int, help='Number of retries for the entire download (default: 10).')
    parser.add_argument('--fragment-retries', type=int, help='Number of retries for each fragment (default: 10).')
    parser.add_argument('--socket-timeout', type=int, help='Timeout for socket operations in seconds (default: 20).')
    parser.add_argument('--add-header', action='append', help='Add a custom HTTP header for the download. Format: "Key: Value". Can be used multiple times.')
    # Arguments to pass through to yt-dlp
    parser.add_argument('--download-sections', help='yt-dlp --download-sections argument (e.g., "*0-10240").')
    parser.add_argument('--test', action='store_true', help='yt-dlp --test argument (download small part).')
    return parser


def main_download_native_py(args):
    """Main logic for the 'download-native-py' command."""
    # All logging should go to stderr to keep stdout clean for the final filename, or for binary data with --output-buffer.
    log_stream = sys.stderr
    log_level = logging.DEBUG if args.verbose else logging.INFO
    # Reconfigure root logger
    for handler in logging.root.handlers[:]:
@ -176,7 +183,10 @@ def main_download_native_py(args):
logger.info(f"Adding {len(extra_args_list)} extra arguments from --extra-ytdlp-args.") logger.info(f"Adding {len(extra_args_list)} extra arguments from --extra-ytdlp-args.")
base_opts_args.extend(extra_args_list) base_opts_args.extend(extra_args_list)
ydl_opts = {} ydl_opts = {
'noresizebuffer': True,
'buffersize': '4M',
}
if base_opts_args: if base_opts_args:
try: try:
logger.info(f"Parsing {len(base_opts_args)} arguments from config/extra_args...") logger.info(f"Parsing {len(base_opts_args)} arguments from config/extra_args...")
@ -192,6 +202,17 @@ def main_download_native_py(args):
                # Handle flags (no value)
                is_flag = i + 1 >= len(base_opts_args) or base_opts_args[i + 1].startswith('--')

                if key == 'resize_buffer':
                    ydl_opts['noresizebuffer'] = False
                    logger.debug(f"Parsed flag: noresizebuffer = False")
                    i += 1
                    continue
                elif key == 'no_resize_buffer':
                    ydl_opts['noresizebuffer'] = True
                    logger.debug(f"Parsed flag: noresizebuffer = True")
                    i += 1
                    continue

                if is_flag:
                    if key.startswith('no_'):
@ -229,6 +250,8 @@ def main_download_native_py(args):
                    # Special handling for keys that differ from CLI arg, e.g. --limit-rate -> ratelimit
                    if key == 'limit_rate':
                        key = 'ratelimit'
                    elif key == 'buffer_size':
                        key = 'buffersize'
                    ydl_opts[key] = value
                    logger.debug(f"Parsed option: {key} = {value}")
@ -257,6 +280,21 @@ def main_download_native_py(args):
        ydl_opts['paths'] = {'temp': args.temp_path}
        logger.info(f"Using temporary path: {args.temp_path}")

    if args.add_header:
        if 'http_headers' not in ydl_opts:
            ydl_opts['http_headers'] = {}
        elif not isinstance(ydl_opts['http_headers'], dict):
            logger.warning(f"Overwriting non-dictionary http_headers from config with headers from command line.")
            ydl_opts['http_headers'] = {}
        for header in args.add_header:
            if ':' not in header:
                logger.error(f"Invalid header format in --add-header: '{header}'. Expected 'Key: Value'.")
                return 1
            key, value = header.split(':', 1)
            ydl_opts['http_headers'][key.strip()] = value.strip()
            logger.info(f"Adding/overwriting header: {key.strip()}: {value.strip()}")

    if args.download_continue:
        ydl_opts['continuedl'] = True
        ydl_opts['nooverwrites'] = True
@ -279,6 +317,19 @@ def main_download_native_py(args):
    if args.merge_output_format:
        ydl_opts['merge_output_format'] = args.merge_output_format

    if args.download_sections:
        ydl_opts['download_sections'] = args.download_sections
    if args.test:
        ydl_opts['test'] = True

    if args.retries is not None:
        ydl_opts['retries'] = args.retries
    if args.fragment_retries is not None:
        ydl_opts['fragment_retries'] = args.fragment_retries
    if args.socket_timeout is not None:
        ydl_opts['socket_timeout'] = args.socket_timeout

    try:
        logger.info(f"Starting download for format '{args.format}' using yt-dlp library...")
@ -301,6 +352,13 @@ def main_download_native_py(args):
        # The success path is now always taken if no exception was raised.
        if retcode == 0:
            if ytdlp_logger.is_403:
                logger.error("Download failed: yt-dlp reported HTTP Error 403: Forbidden. The URL has likely expired.")
                return 1
            if ytdlp_logger.is_timeout:
                logger.error("Download failed: yt-dlp reported a timeout.")
                return 1
            logger.info("yt-dlp download completed successfully.")

            if args.output_buffer:
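The retcode handling above relies on is_403 and is_timeout attributes of a ytdlp_logger object that is not shown in this hunk. A logger along these lines could back those flags; this is a sketch under that assumption, not the project's actual class:

class FlaggingYtdlpLogger:
    """Collects yt-dlp log lines and flags 403s and timeouts for a post-run check."""
    def __init__(self):
        self.is_403 = False
        self.is_timeout = False

    def _scan(self, msg):
        text = str(msg)
        if 'HTTP Error 403' in text:
            self.is_403 = True
        if 'timed out' in text.lower() or 'timeout' in text.lower():
            self.is_timeout = True

    def debug(self, msg): self._scan(msg)
    def info(self, msg): self._scan(msg)
    def warning(self, msg): self._scan(msg)
    def error(self, msg): self._scan(msg)

# Usage sketch: ydl_opts['logger'] = FlaggingYtdlpLogger()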
View File
@ -44,6 +44,14 @@ def add_download_parser(subparsers):
    parser.add_argument('--downloader', help='Name of the external downloader to use (e.g., "aria2c", "native").')
    parser.add_argument('--downloader-args', help='Arguments to pass to the external downloader (e.g., "aria2c:-x 8").')
    parser.add_argument('--merge-output-format', help='Container format to merge to (e.g., "mp4", "mkv"). Overrides config file.')
    parser.add_argument('--retries', help='Number of retries for the entire download (default: 10).')
    parser.add_argument('--fragment-retries', help='Number of retries for each fragment (default: 10).')
    parser.add_argument('--socket-timeout', help='Timeout for socket operations in seconds (default: 20).')
    parser.add_argument('--lang', help='Language code for the request (e.g., "fr", "ja"). Affects metadata language.')
    parser.add_argument('--timezone', help='Timezone for the request (e.g., "UTC", "America/New_York"). Note: not supported by yt-dlp.')
    # Arguments to pass through to yt-dlp
    parser.add_argument('--download-sections', help='yt-dlp --download-sections argument (e.g., "*0-10240").')
    parser.add_argument('--test', action='store_true', help='yt-dlp --test argument (download small part).')
    return parser


def main_download(args):
@ -151,6 +159,19 @@ def main_download(args):
if args.merge_output_format: if args.merge_output_format:
cmd.extend(['--merge-output-format', args.merge_output_format]) cmd.extend(['--merge-output-format', args.merge_output_format])
if args.download_sections:
cmd.extend(['--download-sections', args.download_sections])
if args.test:
cmd.append('--test')
if args.retries:
cmd.extend(['--retries', str(args.retries)])
if args.fragment_retries:
cmd.extend(['--fragment-retries', str(args.fragment_retries)])
if args.socket_timeout:
cmd.extend(['--socket-timeout', str(args.socket_timeout)])
if args.download_continue:
cmd.extend(['--continue', '--part'])
@ -172,6 +193,12 @@ def main_download(args):
if proxy_url:
cmd.extend(['--proxy', proxy_url])
if args.lang:
cmd.extend(['--extractor-args', f'youtube:lang={args.lang}'])
if args.timezone:
logger.warning(f"Timezone override ('{args.timezone}') is not supported by yt-dlp and will be ignored.")
# Determine if we need to capture output.
capture_output = args.cleanup or args.log_file or args.print_traffic
@ -208,6 +235,16 @@ def main_download(args):
stdout_data, stderr_data = process.communicate()
return_code = process.returncode
# Post-run check for silent failures, like 403 errors where yt-dlp might still exit 0.
if return_code == 0:
output_text = (stdout_data or "") + (stderr_data or "")
if "HTTP Error 403" in output_text:
logger.error("yt-dlp exited successfully, but a 403 error was detected in its output. Forcing failure.")
return_code = 1 # Override success code
elif "timed out" in output_text.lower() or "timeout" in output_text.lower():
logger.error("yt-dlp exited successfully, but a timeout was detected in its output. Forcing failure.")
return_code = 1
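The same 403/timeout string checks are applied in the native-library path via the logger flags; the shared detection could be expressed as a small helper like the following (a hypothetical refactoring sketch, not part of this commit):

def classify_ytdlp_failure(output_text):
    """Return '403', 'timeout', or None for a chunk of yt-dlp output."""
    if 'HTTP Error 403' in output_text:
        return '403'
    lowered = output_text.lower()
    if 'timed out' in lowered or 'timeout' in lowered:
        return 'timeout'
    return None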
# Write captured output to terminal and log file
if stdout_data:
sys.stdout.write(stdout_data)

View File

@ -124,7 +124,9 @@ the browser-based generation strategy.''')
parser.add_argument('--direct', action='store_true', help='Use the direct yt-dlp info.json generation method, bypassing Node.js token generation.')
parser.add_argument('--print-info-out', action='store_true', help='Print the final info.json to stdout. By default, output is suppressed unless writing to a file.')
parser.add_argument('--request-params-json', help=REQUEST_PARAMS_HELP_STRING + '\nCan also be a comma-separated string of key=value pairs (e.g., "caching_policy.mode=force_refresh").')
parser.add_argument('--force-renew', help='Comma-separated list of items to force-renew: cookies, visitor_id, po_token, nsig_cache, info_json, all.')
parser.add_argument('--lang', help='Language code for the request (e.g., "fr", "ja"). Affects metadata language.')
parser.add_argument('--timezone', help='Timezone for the request (e.g., "UTC", "America/New_York"). Note: experimental, may not be fully supported.')
return parser
def main_get_info(args):
@ -188,6 +190,16 @@ def main_get_info(args):
items_to_renew = [item.strip() for item in args.force_renew.split(',')]
request_params['force_renew'] = items_to_renew
logger.info(f"Requesting force renew for: {items_to_renew}")
if args.lang:
session_params = request_params.setdefault('session_params', {})
session_params['lang'] = args.lang
logger.info(f"Requesting language: {args.lang}")
if args.timezone:
session_params = request_params.setdefault('session_params', {})
session_params['timeZone'] = args.timezone
logger.info(f"Requesting timezone: {args.timezone}")
if args.verbose:
# Add verbose flag for yt-dlp on the server
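For reference, with `--lang fr --timezone UTC --force-renew cookies,po_token` the `--lang` / `--timezone` / `--force-renew` handling above yields a request_params structure along these lines (values illustrative):

request_params = {
    'force_renew': ['cookies', 'po_token'],
    'session_params': {
        'lang': 'fr',
        'timeZone': 'UTC',
    },
}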
@ -244,6 +256,15 @@ def main_get_info(args):
if not token_data or not hasattr(token_data, 'infoJson') or not token_data.infoJson:
logger.error("Server did not return valid info.json data.")
if args.verbose:
logger.debug(f"Received token_data from server: {token_data!r}")
if not token_data:
logger.error("Reason: The entire token_data object received from the server is null.")
elif not hasattr(token_data, 'infoJson'):
logger.error("Reason: The received token_data object does not have an 'infoJson' attribute.")
elif not token_data.infoJson:
logger.error("Reason: The 'infoJson' attribute in the received token_data object is empty or null.")
print("Error: Server did not return valid info.json data.", file=sys.stderr) print("Error: Server did not return valid info.json data.", file=sys.stderr)
return 1 return 1

View File

@ -9,6 +9,11 @@ import re
from urllib.parse import urlparse, parse_qs
from datetime import datetime, timezone
try:
import yt_dlp
except ImportError:
yt_dlp = None
def format_size(b):
"""Format size in bytes to human-readable string."""
if b is None:
@ -32,9 +37,39 @@ def list_formats(info_json, requested_formats_str=None, file=sys.stdout):
requested_formats = []
requested_order = {}
if requested_formats_str:
if yt_dlp:
try:
ydl = yt_dlp.YoutubeDL({'quiet': True})
formats = info_json.get('formats', [])
selector = ydl.build_format_selector(requested_formats_str)
ctx = {
'formats': formats,
'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats)
or all(f.get('acodec') == 'none' for f in formats)),
}
selected_formats = list(selector(ctx))
all_selected_ids = []
for f in selected_formats:
if 'requested_formats' in f:
all_selected_ids.extend(rf['format_id'] for rf in f['requested_formats'])
else:
all_selected_ids.append(f['format_id'])
requested_formats = all_selected_ids
requested_order = {fmt: i for i, fmt in enumerate(requested_formats)}
except Exception as e:
print(f"WARNING: Could not parse format selector '{requested_formats_str}': {e}", file=sys.stderr)
# Fallback to simple parsing
requested_formats = [item for item in re.split(r'[,/]', requested_formats_str) if item]
requested_order = {fmt: i for i, fmt in enumerate(requested_formats)}
else:
# Fallback to simple parsing if yt-dlp is not installed
print("WARNING: yt-dlp not installed. Using simple format selector parsing.", file=sys.stderr)
requested_formats = [item for item in re.split(r'[,/]', requested_formats_str) if item]
requested_order = {fmt: i for i, fmt in enumerate(requested_formats)}
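To illustrate why the yt-dlp selector path is preferred: the simple fallback only breaks the string on ',' and '/', so merge selectors stay opaque (the selector string below is illustrative):

import re
requested_formats_str = '137+140/best'
print([s for s in re.split(r'[,/]', requested_formats_str) if s])
# -> ['137+140', 'best']; the '+' merge is left as a single token, whereas the
# yt-dlp selector above resolves it to the component format_ids when both
# streams are present in the info.json.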
def sort_key(f):
fid = f.get('format_id', '')

View File

@ -20,6 +20,7 @@ Example of a full configuration JSON showing default values (use single quotes t
"use_curl_prefetch": false, "use_curl_prefetch": false,
"skip_cache": false, "skip_cache": false,
"visitor_id_override_enabled": true, "visitor_id_override_enabled": true,
"webpo_bind_to_visitor_id": true,
"extractor_args": { "extractor_args": {
"youtubepot-bgutilhttp": { "youtubepot-bgutilhttp": {
"base_url": "http://172.17.0.1:4416" "base_url": "http://172.17.0.1:4416"
@ -28,21 +29,22 @@ Example of a full configuration JSON showing default values (use single quotes t
"pot_trace": "true", "pot_trace": "true",
"formats": "duplicate", "formats": "duplicate",
"player_js_version": "actual" "player_js_version": "actual"
},
"youtubepot-webpo": {
"bind_to_visitor_id": "true"
}
}
},
"_comment_ytdlp_params": "Parameters passed directly to the yt-dlp wrapper for info.json generation.",
"_comment_webpo_bind_to_visitor_id": "If true (default), binds the PO Token cache to the visitor ID. Set to false for TV clients if caching issues occur, as this is not recommended for them.",
"_comment_visitor_id_override_enabled": "If true (default), the server validates the visitor ID from the token generator and creates a new one if it is invalid. Set to false to force using the provided visitor ID without validation, which is useful for debugging.", "_comment_visitor_id_override_enabled": "If true (default), the server validates the visitor ID from the token generator and creates a new one if it is invalid. Set to false to force using the provided visitor ID without validation, which is useful for debugging.",
"_comment_extractor_args": "Directly override yt-dlp extractor arguments. To use BGUtils in script mode, replace 'youtubepot-bgutilhttp' with 'youtubepot-bgutilscript'. The script path is '/opt/bgutil-ytdlp-pot-provider-server/build/generate_once.js'. To disable any explicit provider (like '--bgutils-mode none' on the server), remove both 'youtubepot-bgutilhttp' and 'youtubepot-bgutilscript' keys.", "_comment_extractor_args": "Directly override yt-dlp extractor arguments. To use BGUtils in script mode, replace 'youtubepot-bgutilhttp' with 'youtubepot-bgutilscript'. The script path is '/opt/bgutil-ytdlp-pot-provider-server/build/generate_once.js'. To disable any explicit provider (like '--bgutils-mode none' on the server), remove both 'youtubepot-bgutilhttp' and 'youtubepot-bgutilscript' keys.",
"session_params": { "session_params": {
"lang": "en-US", "lang": "en-US",
"timeZone": "UTC",
"location": "US", "location": "US",
"deviceCategory": "MOBILE", "deviceCategory": "MOBILE",
"user_agent": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)" "user_agent": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)",
"visitor_rotation_threshold": 250
},
"_comment_session_params": "Parameters for the token generation session. `visitor_rotation_threshold` overrides the server's default request limit before a profile's visitor ID is rotated. Set to 0 to disable rotation.",
"_comment_lang_and_tz": "`lang` sets the 'hl' parameter for YouTube's API, affecting metadata language. `timeZone` is intended to set the timezone for requests, but is not fully supported by yt-dlp yet."
}'""" }'"""

View File

@ -148,7 +148,8 @@ def get_profile_from_filename(path, regex_pattern):
class StateManager:
"""Tracks statistics, manages rate limits, and persists state across runs."""
def __init__(self, policy_name, disable_log_writing=False):
self.disable_log_writing = disable_log_writing
self.state_file_path = Path(f"{policy_name}_state.json")
self.stats_file_path = Path(f"{policy_name}_stats.jsonl")
self.lock = threading.RLock()
@ -174,6 +175,9 @@ class StateManager:
self._open_stats_log()
def _load_state(self):
if self.disable_log_writing:
logger.info("Log writing is disabled. State will not be loaded from disk.")
return
if not self.state_file_path.exists():
logger.info(f"State file not found at '{self.state_file_path}', starting fresh.")
return
@ -198,6 +202,8 @@ class StateManager:
logger.error(f"Could not load or parse state file {self.state_file_path}: {e}. Starting fresh.") logger.error(f"Could not load or parse state file {self.state_file_path}: {e}. Starting fresh.")
def _save_state(self): def _save_state(self):
if self.disable_log_writing:
return
with self.lock:
try:
with open(self.state_file_path, 'w', encoding='utf-8') as f:
@ -207,6 +213,8 @@ class StateManager:
logger.error(f"Could not save state to {self.state_file_path}: {e}") logger.error(f"Could not save state to {self.state_file_path}: {e}")
def _open_stats_log(self): def _open_stats_log(self):
if self.disable_log_writing:
return
try:
self.stats_file_handle = open(self.stats_file_path, 'a', encoding='utf-8')
except IOError as e:
@ -737,6 +745,18 @@ class StateManager:
logger.info("Requests per proxy:") logger.info("Requests per proxy:")
for proxy, count in sorted(proxy_counts.items()): for proxy, count in sorted(proxy_counts.items()):
logger.info(f" - {proxy}: {count}") logger.info(f" - {proxy}: {count}")
profile_counts = collections.Counter(e.get('profile') for e in fetch_events if e.get('profile'))
if profile_counts:
logger.info("Requests per profile:")
for profile, count in sorted(profile_counts.items()):
logger.info(f" - {profile}: {count}")
if download_events:
total_attempts = len(download_events)
@ -1104,9 +1124,11 @@ def run_download_worker(info_json_path, info_json_content, format_to_download, p
if proxy_rename:
download_cmd.extend(['--proxy-rename', str(proxy_rename)])
# The 'extra_args' from the policy are for the download script itself, not for yt-dlp.
# We need to split them and add them to the command.
extra_args = download_policy.get('extra_args')
if extra_args:
download_cmd.extend(shlex.split(extra_args))
# Pass through downloader settings for yt-dlp to use
# e.g. to tell yt-dlp to use aria2c as its backend
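shlex.split applies shell-like tokenization, so quoted values inside `extra_args` survive as single arguments; a quick sketch (the example string is illustrative):

import shlex
extra_args = '--socket-timeout 20 --downloader-args "aria2c:-x 8"'
print(shlex.split(extra_args))
# -> ['--socket-timeout', '20', '--downloader-args', 'aria2c:-x 8']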
@ -1227,6 +1249,11 @@ def process_info_json_cycle(path, content, policy, state_manager, proxy_url=None
requested_formats = [f.strip() for f in format_selection.split(',') if f.strip()]
formats_to_test = []
for req_fmt in requested_formats:
# If it's a complex selector with slashes, don't try to validate it against available formats.
if '/' in req_fmt:
formats_to_test.append(req_fmt)
continue
# Check for exact match first
if req_fmt in available_formats:
formats_to_test.append(req_fmt)
@ -1661,6 +1688,7 @@ Overridable Policy Parameters via --set:
parser.add_argument('--verbose', action='store_true', help='Enable verbose output for the orchestrator and underlying scripts.')
parser.add_argument('--dry-run', action='store_true', help='Print the effective policy and exit without running the test.')
parser.add_argument('--disable-log-writing', action='store_true', help='Disable writing state, stats, and log files. By default, files are created for each run.')
return parser
@ -1761,11 +1789,6 @@ def main_stress_policy(args):
print_policy_overrides(policy)
return 0
log_level = logging.DEBUG if args.verbose else logging.INFO
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' if args.verbose else '%(asctime)s - %(message)s'
date_format = None if args.verbose else '%H:%M:%S'
logging.basicConfig(level=log_level, format=log_format, datefmt=date_format, stream=sys.stdout)
policy = load_policy(args.policy, args.policy_name)
policy = apply_overrides(policy, args.set)
@ -1782,8 +1805,37 @@ def main_stress_policy(args):
policy.setdefault('download_policy', {})['cleanup'] = args.cleanup
policy_name = policy.get('name', args.policy_name or Path(args.policy).stem)
# --- Logging Setup ---
log_level = logging.DEBUG if args.verbose else logging.INFO
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' if args.verbose else '%(asctime)s - %(message)s'
date_format = None if args.verbose else '%H:%M:%S'
root_logger = logging.getLogger()
root_logger.setLevel(log_level)
# Remove any existing handlers to avoid duplicate logs
for handler in root_logger.handlers[:]:
root_logger.removeHandler(handler)
# Add console handler
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(logging.Formatter(log_format, datefmt=date_format))
root_logger.addHandler(console_handler)
if not args.disable_log_writing:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_filename = f"stress-policy-{timestamp}-{policy_name}.log"
try:
file_handler = logging.FileHandler(log_filename, encoding='utf-8')
file_handler.setFormatter(logging.Formatter(log_format, datefmt=date_format))
root_logger.addHandler(file_handler)
# Use print because logger is just being set up.
print(f"Logging to file: {log_filename}", file=sys.stderr)
except IOError as e:
print(f"Error: Could not open log file {log_filename}: {e}", file=sys.stderr)
state_manager = StateManager(policy_name, disable_log_writing=args.disable_log_writing)
# --- Graceful shutdown handler ---
def shutdown_handler(signum, frame):
@ -1881,26 +1933,20 @@ def main_stress_policy(args):
logger.error("No sources (URLs or info.json files) to process. Exiting.") logger.error("No sources (URLs or info.json files) to process. Exiting.")
return 1 return 1
# --- Group sources by profile if in download_only mode with regex --- # Grouping of sources by profile is now handled inside the main loop to support continuous mode.
profile_tasks = None
task_items = sources # Default to list of sources
profile_extraction_regex = settings.get('profile_extraction_regex')
# For 'auto' worker calculation and initial display, we need to group sources once.
# This will be re-calculated inside the loop for continuous mode.
profile_tasks = None
if mode == 'download_only' and profile_extraction_regex:
logger.info(f"Grouping info.json files by profile using regex: {profile_extraction_regex}")
profile_tasks = collections.defaultdict(list)
for source_path in sources:
profile_name = get_profile_from_filename(source_path, profile_extraction_regex)
if profile_name:
profile_tasks[profile_name].append(source_path)
else:
# Assign to a default profile if no match
profile_tasks['unmatched_profile'].append(source_path)
num_profiles = len(profile_tasks)
logger.info(f"Found {num_profiles} unique profiles. Tasks will be processed sequentially per profile.")
# The new "sources" for the purpose of task distribution are the profiles.
task_items = list(profile_tasks.items())
# --- Auto-calculate workers if needed ---
exec_control = policy.get('execution_control', {})
@ -1977,12 +2023,12 @@ def main_stress_policy(args):
# --- Step 1: Get info.json content ---
info_json_content = None
profile_name = None
if mode in ['full_stack', 'fetch_only']:
gen_policy = policy.get('info_json_generation_policy', {})
cmd_template = gen_policy.get('command_template')
# --- Profile Generation ---
profile_name = None
profile_mode = settings.get('profile_mode')
pm_policy = settings.get('profile_management')
@ -2303,6 +2349,28 @@ def main_stress_policy(args):
time.sleep(10)
continue
# --- Group sources for this cycle ---
task_items = sources
profile_tasks = None
if mode == 'download_only' and profile_extraction_regex:
profile_tasks = collections.defaultdict(list)
for source_path in sources:
profile_name = get_profile_from_filename(source_path, profile_extraction_regex)
if profile_name:
profile_tasks[profile_name].append(source_path)
else:
profile_tasks['unmatched_profile'].append(source_path)
task_items = list(profile_tasks.items())
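get_profile_from_filename (defined earlier in this file) extracts the profile name via the configured regex; the grouping above then behaves roughly like this sketch (the pattern and paths are hypothetical):

import collections, re
pattern = r'info_(?P<profile>[^_]+)_\d+\.json'
sources = ['info_alice_001.json', 'info_bob_002.json', 'notes.txt']
profile_tasks = collections.defaultdict(list)
for path in sources:
    m = re.search(pattern, path)
    profile_tasks[m.group('profile') if m else 'unmatched_profile'].append(path)
print(dict(profile_tasks))
# -> {'alice': ['info_alice_001.json'], 'bob': ['info_bob_002.json'], 'unmatched_profile': ['notes.txt']}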
# If there's nothing to do this cycle, skip.
if not task_items:
if mode == 'download_only' and settings.get('directory_scan_mode') == 'continuous':
# The sleep logic is handled inside the rescanning block.
continue
else:
logger.info("No more sources to process. Ending test.")
break
cycles += 1
if max_cycles > 0 and cycles > max_cycles:
logger.info(f"Reached max cycles ({max_cycles}). Stopping.")