Local changes covering multiple fixes: skip URLs with bad formats, allow sending downloads to the aria2c service, adopt ban CLI policy testing, and pass through language and headers from Airflow DAGs when needed
This commit is contained in:
parent 302282365e
commit 336438d4cc

126 airflow/aria2-pro-docker/Dockerfile (Normal file)
@@ -0,0 +1,126 @@
#     _         _       ____    ____
#    / \   _ __(_) __ _|___ \  |  _ \ _ __ ___
#   / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
#  / ___ \| |  | | (_| |/ __/  |  __/| | | (_) |
# /_/   \_\_|  |_|\__,_|_____| |_|   |_|  \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.

# Using Debian Bullseye as a more stable base than EOL Alpine
FROM debian:bullseye-slim

# Install s6-overlay and build aria2 in a single layer to reduce image size
# renovate: datasource=github-releases depName=just-containers/s6-overlay
ARG S6_OVERLAY_VERSION=v3.1.6.2
RUN BUILD_DEPS=" \
    build-essential \
    autoconf \
    automake \
    autotools-dev \
    libtool \
    pkg-config \
    git \
    gettext \
    autopoint \
    gettext-base \
    libssl-dev \
    libssh2-1-dev \
    libc-ares-dev \
    libexpat1-dev \
    libc-ares-dev \
    vim \
    libexpat1 \
    zlib1g-dev \
    libsqlite3-dev \
    " && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
    jq \
    findutils \
    ca-certificates \
    curl \
    xz-utils \
    dos2unix \
    $BUILD_DEPS && \
    curl -sSL https://github.com/just-containers/s6-overlay/releases/download/${S6_OVERLAY_VERSION}/s6-overlay-noarch.tar.xz -o /tmp/s6-overlay-noarch.tar.xz && \
    curl -sSL https://github.com/just-containers/s6-overlay/releases/download/${S6_OVERLAY_VERSION}/s6-overlay-x86_64.tar.xz -o /tmp/s6-overlay-x86_64.tar.xz && \
    tar -C / -Jxpf /tmp/s6-overlay-noarch.tar.xz && \
    tar -C / -Jxpf /tmp/s6-overlay-x86_64.tar.xz && \
    git clone https://github.com/aria2/aria2.git /tmp/aria2 && \
    cd /tmp/aria2 && \
    git checkout 8985d66e71f980e7d2765753800078f47761f1ba && \
    sed -i "s/\"1\", 1, 16, 'x'));/\"1\", 1, 128, 'x'));/" src/OptionHandlerFactory.cc && \
    autoreconf -i && \
    ./configure \
    --disable-dependency-tracking \
    --enable-static \
    --disable-shared \
    --with-ca-bundle=/etc/ssl/certs/ca-certificates.crt \
    --without-libxml2 \
    --with-libexpat \
    --without-libgcrypt \
    --with-openssl \
    --with-libcares \
    --with-libsqlite3 \
    --with-libssh2 \
    --with-zlib && \
    make -j$(nproc) && \
    make install && \
    cd / && \
    # Build dependencies are intentionally not purged (kept for runtime use): apt-get purge -y --auto-remove $BUILD_DEPS && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/*

COPY rootfs /

RUN find /etc/cont-init.d /etc/services.d -type f -exec dos2unix {} + && \
    find /etc/cont-init.d /etc/services.d -type f -exec chmod +x {} +

ENV S6_BEHAVIOUR_IF_STAGE2_FAILS=1 \
    RCLONE_CONFIG=/config/rclone.conf \
    UPDATE_TRACKERS=true \
    CUSTOM_TRACKER_URL= \
    LISTEN_PORT=6888 \
    RPC_PORT=6800 \
    RPC_SECRET= \
    PUID= PGID= \
    DISK_CACHE= \
    IPV6_MODE= \
    UMASK_SET= \
    SPECIAL_MODE=

EXPOSE \
    6800 \
    6888 \
    6888/udp

VOLUME \
    /config \
    /downloads

#ENTRYPOINT ["/init"]
CMD ["aria2c", \
    "--enable-rpc=true", \
    "--rpc-listen-all=true", \
    "--rpc-listen-port=6800", \
    "--listen-port=6888", \
    "--disable-ipv6=true", \
    "--max-concurrent-downloads=128", \
    "--max-connection-per-server=32", \
    "--split=6", \
    "--min-split-size=2M", \
    "--file-allocation=falloc", \
    "--continue=false", \
    "--check-integrity=false", \
    "--log-level=info", \
    "--console-log-level=info", \
    "--save-session-interval=5", \
    "--dir=/downloads", \
    "--disk-cache=64M", \
    "--input-file=/config/aria2.session", \
    "--save-session=/config/aria2.session"]
17 airflow/aria2-pro-docker/rootfs/Aria2-Pro (Normal file)
@@ -0,0 +1,17 @@

----------------------------------------------------------------

█████╗ ██████╗ ██╗ █████╗ ██████╗  ██████╗ ██████╗  ██████╗
██╔══██╗██╔══██╗██║██╔══██╗╚════██╗ ██╔══██╗██╔══██╗██╔═══██╗
███████║██████╔╝██║███████║ █████╔╝ ██████╔╝██████╔╝██║   ██║
██╔══██║██╔══██╗██║██╔══██║██╔═══╝  ██╔═══╝ ██╔══██╗██║   ██║
██║  ██║██║  ██║██║██║  ██║███████╗ ██║     ██║  ██║╚██████╔╝
╚═╝  ╚═╝╚═╝  ╚═╝╚═╝╚═╝  ╚═╝╚══════╝ ╚═╝     ╚═╝  ╚═╝ ╚═════╝

https://github.com/P3TERX/Aria2-Pro-Docker

Copyright (c) 2020-2021 P3TERX <https://p3terx.com>

Version: COMMIT_HASH | Build Time: DATE_TIME
----------------------------------------------------------------

39 airflow/aria2-pro-docker/rootfs/etc/cont-init.d/08-config (Normal file)
@@ -0,0 +1,39 @@
#!/usr/bin/with-contenv bash
#     _         _       ____    ____
#    / \   _ __(_) __ _|___ \  |  _ \ _ __ ___
#   / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
#  / ___ \| |  | | (_| |/ __/  |  __/| | | (_) |
# /_/   \_\_|  |_|\__,_|_____| |_|   |_|  \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.

. /etc/init-base

mkdir -p ${ARIA2_CONF_DIR} ${SCRIPT_DIR} ${DOWNLOAD_DIR}

PROFILES="
aria2.conf
"

DOWNLOAD_PROFILE

[[ ! -f "${ARIA2_CONF_DIR}/aria2.session" ]] && {
    rm -rf "${ARIA2_CONF_DIR}/aria2.session"
    touch "${ARIA2_CONF_DIR}/aria2.session"
}

if ! [[ "${UPDATE_TRACKERS}" = "false" || "${UPDATE_TRACKERS}" = "disable" ]]; then
    rm -f /etc/services.d/crond/down
    PROFILES="tracker.sh"
    DOWNLOAD_PROFILE
    bash ${SCRIPT_DIR}/tracker.sh ${ARIA2_CONF}
else
    touch /etc/services.d/crond/down
fi

exit 0
35 airflow/aria2-pro-docker/rootfs/etc/cont-init.d/18-mode (Normal file)
@@ -0,0 +1,35 @@
#!/usr/bin/with-contenv bash
#     _         _       ____    ____
#    / \   _ __(_) __ _|___ \  |  _ \ _ __ ___
#   / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
#  / ___ \| |  | | (_| |/ __/  |  __/| | | (_) |
# /_/   \_\_|  |_|\__,_|_____| |_|   |_|  \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.

. /etc/init-base

INSTALL_RCLONE() {
    if [[ ! -f /usr/local/bin/rclone ]]; then
        echo
        echo -e "${INFO} Installing RCLONE ..."
        [[ -L /usr/bin/unzip ]] && rm -f /usr/bin/unzip
        curl -fsSL https://rclone.org/install.sh | bash
    fi
}

if [[ "${SPECIAL_MODE}" = "rclone" ]]; then
    INSTALL_RCLONE
    PROFILES="upload.sh rclone.env"
    DOWNLOAD_PROFILE
elif [[ "${SPECIAL_MODE}" = "move" ]]; then
    PROFILES="move.sh"
    DOWNLOAD_PROFILE
fi

exit 0
61 airflow/aria2-pro-docker/rootfs/etc/cont-init.d/28-fix (Normal file)
@@ -0,0 +1,61 @@
#!/usr/bin/with-contenv bash
#     _         _       ____    ____
#    / \   _ __(_) __ _|___ \  |  _ \ _ __ ___
#   / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
#  / ___ \| |  | | (_| |/ __/  |  __/| | | (_) |
# /_/   \_\_|  |_|\__,_|_____| |_|   |_|  \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.

. /etc/init-base

[[ -e ${ARIA2_CONF_DIR}/delete.sh ]] && {
    rm -f ${ARIA2_CONF_DIR}/*.sh
    sed -i "s@^\(on-download-stop=\).*@\1${SCRIPT_DIR}/delete.sh@" ${ARIA2_CONF}
    sed -i "s@^\(on-download-complete=\).*@\1${SCRIPT_DIR}/clean.sh@" ${ARIA2_CONF}
}

sed -i "s@^\(dir=\).*@\1/downloads@" ${ARIA2_CONF}
sed -i "s@^\(input-file=\).*@\1${ARIA2_CONF_DIR}/aria2.session@" ${ARIA2_CONF}
sed -i "s@^\(save-session=\).*@\1${ARIA2_CONF_DIR}/aria2.session@" ${ARIA2_CONF}
sed -i "s@^\(dht-file-path=\).*@\1${ARIA2_CONF_DIR}/dht.dat@" ${ARIA2_CONF}
sed -i "s@^\(dht-file-path6=\).*@\1${ARIA2_CONF_DIR}/dht6.dat@" ${ARIA2_CONF}

[[ -e ${ARIA2_CONF_DIR}/HelloWorld ]] && exit 0

[[ ${RPC_PORT} ]] &&
    sed -i "s@^\(rpc-listen-port=\).*@\1${RPC_PORT}@" ${ARIA2_CONF}

[[ ${LISTEN_PORT} ]] && {
    sed -i "s@^\(listen-port=\).*@\1${LISTEN_PORT}@" ${ARIA2_CONF}
    sed -i "s@^\(dht-listen-port=\).*@\1${LISTEN_PORT}@" ${ARIA2_CONF}
}

[[ ${RPC_SECRET} ]] &&
    sed -i "s@^\(rpc-secret=\).*@\1${RPC_SECRET}@" ${ARIA2_CONF}

[[ ${DISK_CACHE} ]] &&
    sed -i "s@^\(disk-cache=\).*@\1${DISK_CACHE}@" ${ARIA2_CONF}

[[ "${IPV6_MODE}" = "true" || "${IPV6_MODE}" = "enable" ]] && {
    sed -i "s@^\(disable-ipv6=\).*@\1false@" ${ARIA2_CONF}
    sed -i "s@^\(enable-dht6=\).*@\1true@" ${ARIA2_CONF}
}

[[ "${IPV6_MODE}" = "false" || "${IPV6_MODE}" = "disable" ]] && {
    sed -i "s@^\(disable-ipv6=\).*@\1true@" ${ARIA2_CONF}
    sed -i "s@^\(enable-dht6=\).*@\1false@" ${ARIA2_CONF}
}

[[ "${SPECIAL_MODE}" = "rclone" ]] &&
    sed -i "s@^\(on-download-complete=\).*@\1${SCRIPT_DIR}/upload.sh@" ${ARIA2_CONF}

[[ "${SPECIAL_MODE}" = "move" ]] &&
    sed -i "s@^\(on-download-complete=\).*@\1${SCRIPT_DIR}/move.sh@" ${ARIA2_CONF}

exit 0
@@ -0,0 +1,27 @@
#!/usr/bin/with-contenv bash
#     _         _       ____    ____
#    / \   _ __(_) __ _|___ \  |  _ \ _ __ ___
#   / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
#  / ___ \| |  | | (_| |/ __/  |  __/| | | (_) |
# /_/   \_\_|  |_|\__,_|_____| |_|   |_|  \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.

. /etc/init-base
if [ -w ${DOWNLOAD_DIR} ]; then echo "Download DIR writeable, not changing owner."; else chown -R p3terx:p3terx ${DOWNLOAD_DIR}; fi
chown -R p3terx:p3terx ${ARIA2_CONF_DIR}
if [[ -z ${PUID} && -z ${PGID} ]] || [[ ${PUID} = 65534 && ${PGID} = 65534 ]]; then
    echo -e "${WARN} Ignore permission settings."
    chmod -v 777 ${DOWNLOAD_DIR}
    chmod -vR 777 ${ARIA2_CONF_DIR}
else
    if [ -w ${DOWNLOAD_DIR} ]; then echo "Download DIR writeable, not modifying permission."; else chmod -v u=rwx ${DOWNLOAD_DIR}; fi
    chmod -v 600 ${ARIA2_CONF_DIR}/*
    chmod -v 755 ${SCRIPT_DIR}
    chmod -v 700 ${SCRIPT_DIR}/*
fi
2 airflow/aria2-pro-docker/rootfs/etc/cont-init.d/88-done (Normal file)
@@ -0,0 +1,2 @@
#!/bin/sh
cat /Aria2-Pro
1 airflow/aria2-pro-docker/rootfs/etc/crontabs/p3terx (Normal file)
@@ -0,0 +1 @@
# BT tracker updates disabled.
118 airflow/aria2-pro-docker/rootfs/etc/init-base (Normal file)
@@ -0,0 +1,118 @@
#     _         _       ____    ____
#    / \   _ __(_) __ _|___ \  |  _ \ _ __ ___
#   / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
#  / ___ \| |  | | (_| |/ __/  |  __/| | | (_) |
# /_/   \_\_|  |_|\__,_|_____| |_|   |_|  \___/
#
# https://github.com/P3TERX/Docker-Aria2-Pro
#
# Copyright (c) 2020 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.

Green_font_prefix="\033[32m"
Red_font_prefix="\033[31m"
Green_background_prefix="\033[42;37m"
Red_background_prefix="\033[41;37m"
Font_color_suffix="\033[0m"
INFO="[${Green_font_prefix}INFO${Font_color_suffix}]"
ERROR="[${Red_font_prefix}ERROR${Font_color_suffix}]"
WARN="[${Yellow_font_prefix}WARN${Font_color_suffix}]"
DOWNLOAD_DIR="/downloads"
ARIA2_CONF_DIR="/config"
ARIA2_CONF="${ARIA2_CONF_DIR}/aria2.conf"
SCRIPT_CONF="${ARIA2_CONF_DIR}/script.conf"
SCRIPT_DIR="${ARIA2_CONF_DIR}/script"
CURL_OPTIONS="-fsSL --connect-timeout 3 --max-time 3"
PROFILE_URL1="https://p3terx.github.io/aria2.conf"
PROFILE_URL2="https://aria2c.now.sh"
PROFILE_URL3="https://cdn.jsdelivr.net/gh/P3TERX/aria2.conf"

FILE_ALLOCATION_SET() {
    TMP_FILE="/downloads/P3TERX.COM"
    if fallocate -l 5G ${TMP_FILE}; then
        FILE_ALLOCATION=falloc
    else
        FILE_ALLOCATION=none
    fi
    rm -f ${TMP_FILE}
    sed -i "s@^\(file-allocation=\).*@\1${FILE_ALLOCATION}@" "${ARIA2_CONF}"
}

CONVERSION_ARIA2_CONF() {
    sed -i "s@^\(rpc-listen-port=\).*@\1${RPC_PORT:-6800}@" "${ARIA2_CONF}"
    sed -i "s@^\(listen-port=\).*@\1${LISTEN_PORT:-6888}@" "${ARIA2_CONF}"
    sed -i "s@^\(dht-listen-port=\).*@\1${LISTEN_PORT:-6888}@" "${ARIA2_CONF}"
    sed -i "s@^\(dir=\).*@\1/downloads@" "${ARIA2_CONF}"
    sed -i "s@/root/.aria2@${ARIA2_CONF_DIR}@" "${ARIA2_CONF}"
    sed -i "s@^#\(retry-on-.*=\).*@\1true@" "${ARIA2_CONF}"
    sed -i "s@^\(max-connection-per-server=\).*@\1128@" "${ARIA2_CONF}"
    sed -i "/^on-download-stop=/d" "${ARIA2_CONF}"
    sed -i "/^on-download-complete=/d" "${ARIA2_CONF}"

    # Custom settings from user
    sed -i "s@^\(continue=\).*@\1false@" "${ARIA2_CONF}"
    sed -i "s@^\(always-resume=\).*@\1false@" "${ARIA2_CONF}"
    sed -i "s@^\(max-concurrent-downloads=\).*@\1500@" "${ARIA2_CONF}"
    sed -i "s@^\(enable-dht=\).*@\1false@" "${ARIA2_CONF}"
    sed -i "s@^\(enable-dht6=\).*@\1false@" "${ARIA2_CONF}"
    sed -i "s@^\(bt-enable-lpd=\).*@\1true@" "${ARIA2_CONF}"
    sed -i "s@^\(enable-peer-exchange=\).*@\1false@" "${ARIA2_CONF}"
    sed -i "s@^\(max-overall-upload-limit=\).*@\12M@" "${ARIA2_CONF}"
    sed -i "s@^\(seed-time=\).*@\11@" "${ARIA2_CONF}"
    sed -i "s@^\(user-agent=\).*@\1Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version@" "${ARIA2_CONF}"
    sed -i "s@^\(peer-id-prefix=\).*@\1-DE13F0-@" "${ARIA2_CONF}"
    sed -i "s@^\(summary-interval=\).*@\11@" "${ARIA2_CONF}"
    sed -i "s@^\(show-console-readout=\).*@\1false@" "${ARIA2_CONF}"
    sed -i "s@^\(console-log-level=\).*@\1notice@" "${ARIA2_CONF}"

    # Add settings not present in default config
    echo "" >>"${ARIA2_CONF}"
    echo "# Custom settings added" >>"${ARIA2_CONF}"
    echo "disable-metalink=true" >>"${ARIA2_CONF}"
    echo "follow-torrent=false" >>"${ARIA2_CONF}"
    echo "retry-on-400=false" >>"${ARIA2_CONF}"
    echo "retry-on-403=false" >>"${ARIA2_CONF}"
    echo "retry-on-406=false" >>"${ARIA2_CONF}"
    echo "retry-on-unknown=true" >>"${ARIA2_CONF}"
    echo "rpc-listen-all=true" >>"${ARIA2_CONF}"

    [[ $TZ != "Asia/Shanghai" ]] && sed -i '11,$s/#.*//;/^$/d' "${ARIA2_CONF}"
    FILE_ALLOCATION_SET
}

CONVERSION_SCRIPT_CONF() {
    sed -i "s@\(upload-log=\).*@\1${ARIA2_CONF_DIR}/upload.log@" "${SCRIPT_CONF}"
    sed -i "s@\(move-log=\).*@\1${ARIA2_CONF_DIR}/move.log@" "${SCRIPT_CONF}"
    sed -i "s@^\(dest-dir=\).*@\1${DOWNLOAD_DIR}/completed@" "${SCRIPT_CONF}"
}

CONVERSION_CORE() {
    sed -i "s@\(ARIA2_CONF_DIR=\"\).*@\1${ARIA2_CONF_DIR}\"@" "${SCRIPT_DIR}/core"
}

DOWNLOAD_PROFILE() {
    for PROFILE in ${PROFILES}; do
        [[ ${PROFILE} = *.sh || ${PROFILE} = core ]] && cd "${SCRIPT_DIR}" || cd "${ARIA2_CONF_DIR}"
        while [[ ! -f ${PROFILE} ]]; do
            rm -rf ${PROFILE}
            echo
            echo -e "${INFO} Downloading '${PROFILE}' ..."
            curl -O ${CURL_OPTIONS} ${PROFILE_URL1}/${PROFILE} ||
                curl -O ${CURL_OPTIONS} ${PROFILE_URL2}/${PROFILE} ||
                curl -O ${CURL_OPTIONS} ${PROFILE_URL3}/${PROFILE}
            [[ -s ${PROFILE} ]] && {
                [[ "${PROFILE}" = "aria2.conf" ]] && CONVERSION_ARIA2_CONF
                [[ "${PROFILE}" = "script.conf" ]] && CONVERSION_SCRIPT_CONF
                [[ "${PROFILE}" = "core" ]] && CONVERSION_CORE
                echo
                echo -e "${INFO} '${PROFILE}' download completed !"
            } || {
                echo
                echo -e "${ERROR} '${PROFILE}' download error, retry ..."
                sleep 3
            }
        done
    done
}
15 airflow/aria2-pro-docker/rootfs/etc/services.d/aria2/finish (Normal file)
@@ -0,0 +1,15 @@
#!/usr/bin/execlineb -S0
#     _         _       ____    ____
#    / \   _ __(_) __ _|___ \  |  _ \ _ __ ___
#   / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
#  / ___ \| |  | | (_| |/ __/  |  __/| | | (_) |
# /_/   \_\_|  |_|\__,_|_____| |_|   |_|  \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.

s6-svscanctl -t /var/run/s6/services
18 airflow/aria2-pro-docker/rootfs/etc/services.d/aria2/run (Normal file)
@@ -0,0 +1,18 @@
#!/usr/bin/with-contenv bash
#     _         _       ____    ____
#    / \   _ __(_) __ _|___ \  |  _ \ _ __ ___
#   / _ \ | '__| |/ _` | __) | | |_) | '__/ _ \
#  / ___ \| |  | | (_| |/ __/  |  __/| | | (_) |
# /_/   \_\_|  |_|\__,_|_____| |_|   |_|  \___/
#
# https://github.com/P3TERX/Aria2-Pro-Docker
#
# Copyright (c) 2020-2021 P3TERX <https://p3terx.com>
#
# This is free software, licensed under the MIT License.
# See /LICENSE for more information.

umask ${UMASK_SET:-022}

exec s6-setuidgid p3terx aria2c \
    --conf-path=/config/aria2.conf
@@ -260,6 +260,37 @@ services:
      - proxynet
    restart: always

+  aria2-pro:
+    container_name: aria2-pro
+    build:
+      context: "{{ airflow_worker_dir }}/aria2-pro-docker"
+    environment:
+      - PUID=${AIRFLOW_UID:-50000}
+      - PGID=0
+      - UMASK_SET=022
+      - RPC_SECRET={{ vault_aria2_rpc_secret }}
+      - RPC_PORT=6800
+      - LISTEN_PORT=6888
+      - DISK_CACHE=64M
+      - IPV6_MODE=false
+      - UPDATE_TRACKERS=false
+      - CUSTOM_TRACKER_URL=
+      - TZ=Asia/Shanghai
+    volumes:
+      - ${AIRFLOW_PROJ_DIR:-.}/aria2-config:/config
+      - ${AIRFLOW_PROJ_DIR:-.}/downloadfiles/videos/in-progress:/downloads
+    ports:
+      - "127.0.0.1:6800:6800"
+      - "6888:6888"
+      - "6888:6888/udp"
+    networks:
+      - proxynet
+    restart: unless-stopped
+    logging:
+      driver: json-file
+      options:
+        max-size: 1m
+
networks:
  proxynet:
    name: airflow_proxynet
@@ -132,6 +132,8 @@ services:
        - "--comms-log-root-dir"
        - "/app/logs/yt-dlp-ops/communication_logs"
        - "--bgutils-no-innertube"
+        - "--visitor-rotation-threshold"
+        - "250"
        {% endif %}
    restart: unless-stopped
    pull_policy: always
@@ -327,7 +327,7 @@ def manage_system_callable(**context):
    action = params["action"]

    # For Thrift actions, use the new management host/port
-    if entity not in ["airflow_meta", "activity_counters"]:
+    if entity not in ["activity_counters"]:
        host = params["management_host"]
        port = params["management_port"]
    else:
@@ -343,7 +343,6 @@ def manage_system_callable(**context):
        "account": ["list_with_status", "ban", "unban", "unban_all", "delete_from_redis"],
        "client": ["list_with_status", "delete_from_redis"],
        "accounts_and_proxies": ["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis"],
-        "airflow_meta": ["clear_dag_runs"],
        "activity_counters": ["list_with_status"],
    }

@@ -364,41 +363,6 @@ def manage_system_callable(**context):
    if action in ["ban", "unban"] and not account_id:
        raise ValueError(f"An 'account_id' is required for account action '{action}'.")

-    # --- Handle Airflow Meta actions separately as they don't use Thrift ---
-    if entity == "airflow_meta":
-        dag_id = params.get("dag_id_to_manage")
-
-        if action == "clear_dag_runs":
-            clear_scope = params.get("clear_scope")
-            logger.info(f"Attempting to delete DagRuns for DAG '{dag_id}' with scope '{clear_scope}'.")
-
-            with create_session() as session:
-                dag_run_query = session.query(DagRun).filter(DagRun.dag_id == dag_id)
-
-                if clear_scope == "last_run":
-                    last_run = dag_run_query.order_by(DagRun.execution_date.desc()).first()
-                    if not last_run:
-                        logger.info(f"No runs found for DAG '{dag_id}'. Nothing to delete.")
-                        print(f"\nNo runs found for DAG '{dag_id}'.\n")
-                        return
-
-                    logger.warning(f"Deleting last DagRun for DAG '{dag_id}' (run_id: {last_run.run_id}, execution_date: {last_run.execution_date}). This will also delete its task instances.")
-                    # Deleting the DagRun object should cascade and delete related TaskInstances.
-                    session.delete(last_run)
-                    deleted_count = 1
-                else:  # all_runs
-                    logger.warning(f"Deleting ALL DagRuns and associated TaskInstances for DAG '{dag_id}'. This will remove all history from the UI.")
-                    # To ensure all related data is cleared, we explicitly delete TaskInstances first.
-                    # This is safer than relying on DB-level cascades which may not be configured.
-                    ti_deleted_count = session.query(TaskInstance).filter(TaskInstance.dag_id == dag_id).delete(synchronize_session=False)
-                    logger.info(f"Deleted {ti_deleted_count} TaskInstance records for DAG '{dag_id}'.")
-
-                    deleted_count = dag_run_query.delete(synchronize_session=False)
-
-            # The session is committed automatically by the `with create_session()` context manager.
-            logger.info(f"Successfully deleted {deleted_count} DagRun(s) for DAG '{dag_id}'.")
-            print(f"\nSuccessfully deleted {deleted_count} DagRun(s) for DAG '{dag_id}'.\n")
-            return  # End execution
-
    # --- Handle Activity Counter action ---
    if entity == "activity_counters":
@@ -855,13 +819,13 @@ with DAG(
        "entity": Param(
            "accounts_and_proxies",
            type="string",
-            enum=["account", "proxy", "client", "accounts_and_proxies", "activity_counters", "airflow_meta"],
+            enum=["account", "proxy", "client", "accounts_and_proxies", "activity_counters"],
            description="The type of entity to manage.",
        ),
        "action": Param(
            "list_with_status",
            type="string",
-            enum=["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis", "clear_dag_runs"],
+            enum=["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis"],
            description="""The management action to perform.
---
#### Actions for `entity: proxy`
@@ -895,10 +859,6 @@ with DAG(
- `unban_all`: Un-ban all proxies for a `server_identity` (or all servers) AND all accounts (optionally filtered by `account_id` as a prefix).
- `delete_from_redis`: Deletes both account and proxy status from Redis via Thrift service. For accounts, if `account_id` is provided as a prefix, it deletes all accounts matching that prefix. If `account_id` is empty, it deletes ALL accounts. For proxies, if `server_identity` is provided, it deletes all proxies for that server. If `server_identity` is empty, it deletes ALL proxies across all servers.
-
-#### Actions for `entity: airflow_meta`
-- `clear_dag_runs`: **(Destructive)** Deletes DAG run history and associated task instances from the database, removing them from the UI. This allows the runs to be re-created if backfilling is enabled.
-- `clear_scope: last_run`: Deletes only the most recent DAG run and its task instances.
-- `clear_scope: all_runs`: Deletes all historical DAG runs and task instances for the selected DAG.
            """,
        ),
        "server_identity": Param(
@@ -922,20 +882,6 @@ with DAG(
            title="Redis Connection ID",
            description="The Airflow connection ID for the Redis server (used for 'delete_from_redis' and for fetching detailed account status).",
        ),
-        "dag_id_to_manage": Param(
-            "ytdlp_ops_v01_worker_per_url",
-            type="string",
-            enum=["ytdlp_ops_v01_orchestrator", "ytdlp_ops_v01_dispatcher", "ytdlp_ops_v01_worker_per_url", "ytdlp_ops_v02_orchestrator_auth", "ytdlp_ops_v02_dispatcher_auth", "ytdlp_ops_v02_worker_per_url_auth", "ytdlp_ops_v02_orchestrator_dl", "ytdlp_ops_v02_dispatcher_dl", "ytdlp_ops_v02_worker_per_url_dl"],
-            title="[Airflow Meta] DAG ID",
-            description="The DAG ID to perform the action on.",
-        ),
-        "clear_scope": Param(
-            "last_run",
-            type="string",
-            enum=["last_run", "all_runs"],
-            title="[Airflow Meta] Clear Scope",
-            description="For 'clear_dag_runs' action, specifies the scope of runs to clear.",
-        ),
    },
) as dag:
    system_management_task = PythonOperator(
@@ -15,7 +15,9 @@ from datetime import datetime

from airflow.exceptions import AirflowException
from airflow.models.dag import DAG
+from airflow.models.dagrun import DagRun
from airflow.models.param import Param
+from airflow.models.taskinstance import TaskInstance
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.operators.empty import EmptyOperator
from airflow.operators.bash import BashOperator
@@ -23,6 +25,7 @@ from airflow.providers.celery.executors.celery_executor import app as celery_app
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
from airflow.models.variable import Variable
+from airflow.utils.session import create_session
import requests

# Configure logging
@@ -276,7 +279,10 @@ def dump_redis_data_to_csv(redis_client, dump_dir, patterns):


def clear_queue_callable(**context):
-    """Dumps Redis data to CSV and/or clears specified Redis keys based on selection."""
+    """
+    Dumps Redis data to CSV and/or clears specified Redis keys based on selection.
+    The `_skipped` queue is for videos that are unavailable due to external reasons (e.g., private, removed).
+    """
    params = context['params']
    ti = context['task_instance']
    logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
@@ -315,7 +321,7 @@ def clear_queue_callable(**context):
        logger.info("Dumping is enabled. Performing dump before clearing.")
        dump_redis_data_to_csv(redis_client, dump_dir, dump_patterns)

-    all_suffixes = ['_inbox', '_fail', '_result', '_progress']
+    all_suffixes = ['_inbox', '_fail', '_result', '_progress', '_skipped']
    keys_to_delete = set()
    for queue_base_name in queue_base_names_to_clear:
        if '_all' in queues_to_clear_options:
@@ -420,7 +426,10 @@ def list_contents_callable(**context):


def check_status_callable(**context):
-    """Checks the status (type and size) of all standard Redis queues for a given base name."""
+    """
+    Checks the status (type and size) of all standard Redis queues for a given base name.
+    The `_skipped` queue is for videos that are unavailable due to external reasons (e.g., private, removed).
+    """
    params = context['params']
    ti = context['task_instance']
    logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
@@ -436,7 +445,7 @@ def check_status_callable(**context):
    else:
        raise ValueError(f"Invalid queue_system: {queue_system}")

-    queue_suffixes = ['_inbox', '_progress', '_result', '_fail']
+    queue_suffixes = ['_inbox', '_progress', '_result', '_fail', '_skipped']

    logger.info(f"--- Checking Status for Queue System: '{queue_system}' ---")

@@ -575,6 +584,56 @@ def purge_celery_queue_callable(**context):
    logger.info("--- Purge complete. ---")


+
+def clear_dag_runs_callable(**context):
+    """
+    Deletes DAG run history and associated task instances from the database.
+    """
+    params = context['params']
+    dag_id = params.get("dag_id_to_manage")
+    clear_scope = params.get("clear_scope")
+
+    log_target = f"DAG '{dag_id}'" if dag_id != "ALL_DAGS" else "ALL DAGS (except ytdlp_mgmt_queues)"
+    logger.info(f"Attempting to delete DagRuns for {log_target} with scope '{clear_scope}'.")
+
+    with create_session() as session:
+        dag_run_query = session.query(DagRun)
+        if dag_id == "ALL_DAGS":
+            dag_run_query = dag_run_query.filter(DagRun.dag_id != 'ytdlp_mgmt_queues')
+        else:
+            dag_run_query = dag_run_query.filter(DagRun.dag_id == dag_id)
+
+        if clear_scope == "last_run":
+            if dag_id == "ALL_DAGS":
+                raise AirflowException("Cannot clear 'last_run' for ALL_DAGS. Please select a specific DAG.")
+
+            last_run = dag_run_query.order_by(DagRun.execution_date.desc()).first()
+            if not last_run:
+                logger.info(f"No runs found for DAG '{dag_id}'. Nothing to delete.")
+                print(f"\nNo runs found for DAG '{dag_id}'.\n")
+                return
+
+            logger.warning(f"Deleting last DagRun for DAG '{dag_id}' (run_id: {last_run.run_id}, execution_date: {last_run.execution_date}). This will also delete its task instances.")
+            session.delete(last_run)
+            deleted_count = 1
+        else:  # all_runs
+            logger.warning(f"Deleting ALL DagRuns and associated TaskInstances for {log_target}. This will remove all history from the UI.")
+
+            ti_query = session.query(TaskInstance)
+            if dag_id == "ALL_DAGS":
+                ti_query = ti_query.filter(TaskInstance.dag_id != 'ytdlp_mgmt_queues')
+            else:
+                ti_query = ti_query.filter(TaskInstance.dag_id == dag_id)
+
+            ti_deleted_count = ti_query.delete(synchronize_session=False)
+            logger.info(f"Deleted {ti_deleted_count} TaskInstance records for {log_target}.")
+
+            deleted_count = dag_run_query.delete(synchronize_session=False)
+
+    # The session is committed automatically by the `with create_session()` context manager.
+    logger.info(f"Successfully deleted {deleted_count} DagRun(s) for {log_target}.")
+    print(f"\nSuccessfully deleted {deleted_count} DagRun(s) for {log_target}.\n")
+
+
def add_videos_to_queue_callable(**context):
    """
    Parses video inputs from manual text, a predefined file, or a file path/URL,
@@ -671,12 +730,13 @@ with DAG(
    - `check_status`: Check the overall status of the queues.
    - `requeue_failed`: Copy all URLs from the `_fail` hash to the `_inbox` list and clear the `_fail` hash.
    - `purge_celery_queue`: **(Destructive)** Removes all tasks from a specified Celery worker queue (e.g., `queue-dl`). This is useful for clearing out a backlog of tasks that were queued before a dispatcher was paused.
+    - `clear_dag_runs`: **(Destructive)** Deletes DAG run history and associated task instances from the database, removing them from the UI.
    """,
    params={
        "action": Param(
            "list_contents",
            type="string",
-            enum=["add_videos", "clear_queue", "list_contents", "check_status", "requeue_failed", "inspect_celery_cluster", "purge_celery_queue"],
+            enum=["add_videos", "clear_queue", "list_contents", "check_status", "requeue_failed", "inspect_celery_cluster", "purge_celery_queue", "clear_dag_runs"],
            title="Action",
            description="The management action to perform.",
        ),
@@ -737,7 +797,7 @@ with DAG(
            description="Select which standard queues to clear. '_all' clears all four. If left empty, it defaults to '_all'.",
            items={
                "type": "string",
-                "enum": ["_inbox", "_fail", "_result", "_progress", "_all"],
+                "enum": ["_inbox", "_fail", "_result", "_progress", "_skipped", "_all"],
            }
        ),
        "confirm_clear": Param(
@@ -766,7 +826,7 @@ with DAG(
        ),
        # --- Params for 'list_contents' ---
        "queue_to_list": Param(
-            'video_queue_inbox,queue2_auth_inbox,queue2_dl_result',
+            'video_queue_inbox,queue2_auth_inbox,queue2_dl_inbox,queue2_dl_result',
            type="string",
            title="[list_contents] Queues to List",
            description="Comma-separated list of exact Redis key names to list.",
@@ -797,6 +857,21 @@ with DAG(
            title="[purge_celery_queue] Confirm Purge",
            description="Must be set to True to execute the 'purge_celery_queue' action. This is a destructive operation that removes all tasks from the specified Celery queue(s).",
        ),
+        # --- Params for 'clear_dag_runs' ---
+        "dag_id_to_manage": Param(
+            "ALL_DAGS",
+            type="string",
+            enum=["ALL_DAGS", "ytdlp_ops_v01_orchestrator", "ytdlp_ops_v01_dispatcher", "ytdlp_ops_v01_worker_per_url", "ytdlp_ops_v02_orchestrator_auth", "ytdlp_ops_v02_dispatcher_auth", "ytdlp_ops_v02_worker_per_url_auth", "ytdlp_ops_v02_orchestrator_dl", "ytdlp_ops_v02_dispatcher_dl", "ytdlp_ops_v02_worker_per_url_dl"],
+            title="[clear_dag_runs] DAG ID",
+            description="The DAG ID to perform the action on. Select 'ALL_DAGS' to clear history for all DAGs.",
+        ),
+        "clear_scope": Param(
+            "all_runs",
+            type="string",
+            enum=["last_run", "all_runs"],
+            title="[clear_dag_runs] Clear Scope",
+            description="For 'clear_dag_runs' action, specifies the scope of runs to clear.",
+        ),
        # --- Common Params ---
        "redis_conn_id": Param(
            DEFAULT_REDIS_CONN_ID,
@@ -866,6 +941,11 @@ with DAG(
        python_callable=purge_celery_queue_callable,
    )

+    action_clear_dag_runs = PythonOperator(
+        task_id="action_clear_dag_runs",
+        python_callable=clear_dag_runs_callable,
+    )
+
    # --- Wire up tasks ---
    branch_on_action >> [
        action_add_videos,
@@ -875,4 +955,5 @@ with DAG(
        action_requeue_failed,
        action_inspect_celery_cluster,
        action_purge_celery_queue,
+        action_clear_dag_runs,
    ]
@@ -20,7 +20,7 @@ from airflow.utils.dates import days_ago
from airflow.api.common.trigger_dag import trigger_dag
from airflow.models.dagrun import DagRun
from airflow.models.dag import DagModel
-from datetime import timedelta
+from datetime import timedelta, datetime
import logging
import random
import time
@@ -37,41 +37,6 @@ from thrift.transport import TSocket, TTransport
# Configure logging
logger = logging.getLogger(__name__)

-DEFAULT_REQUEST_PARAMS_JSON = """{
-    "context_reuse_policy": {
-        "enabled": true,
-        "max_age_seconds": 86400,
-        "reuse_visitor_id": true,
-        "reuse_cookies": true
-    },
-    "token_generation_strategy": {
-        "youtubei_js": {
-            "generate_po_token": true,
-            "generate_gvs_token": true
-        }
-    },
-    "ytdlp_params": {
-        "use_curl_prefetch": false,
-        "token_supplement_strategy": {
-            "youtubepot_bgutilhttp_extractor": {
-                "enabled": true
-            }
-        },
-        "visitor_id_override": {
-            "enabled": true
-        }
-    },
-    "session_params": {
-        "lang": "en-US",
-        "location": "US",
-        "deviceCategory": "MOBILE",
-        "user_agents": {
-            "youtubei_js": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)",
-            "yt_dlp": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)"
-        }
-    }
-}"""
-
# Default settings
DEFAULT_QUEUE_NAME = 'video_queue'
DEFAULT_REDIS_CONN_ID = 'redis_default'
@@ -191,6 +156,17 @@ def orchestrate_workers_ignition_callable(**context):
    dag_run_id = context['dag_run'].run_id
    total_triggered = 0

+    # --- Generate a consistent timestamped prefix for this orchestrator run ---
+    # This ensures all workers spawned from this run use the same set of accounts.
+    final_account_pool_prefix = params['account_pool']
+    if params.get('prepend_client_to_account') and params.get('account_pool_size') is not None:
+        clients_str = params.get('clients', '')
+        primary_client = clients_str.split(',')[0].strip() if clients_str else 'unknown'
+        # Use a timestamp from the orchestrator's run for consistency
+        timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
+        final_account_pool_prefix = f"{params['account_pool']}_{timestamp}_{primary_client}"
+        logger.info(f"Generated consistent account prefix for this run: '{final_account_pool_prefix}'")
+
    for i, bunch in enumerate(bunches):
        logger.info(f"--- Triggering Bunch {i+1}/{len(bunches)} (contains {len(bunch)} dispatcher(s)) ---")
        for j, _ in enumerate(bunch):
@@ -199,6 +175,8 @@ def orchestrate_workers_ignition_callable(**context):

            # Pass all orchestrator params to the dispatcher, which will then pass them to the worker.
            conf_to_pass = {p: params[p] for p in params}
+            # Override account_pool with the generated prefix
+            conf_to_pass['account_pool'] = final_account_pool_prefix

            logger.info(f"Triggering dispatcher {j+1}/{len(bunch)} in bunch {i+1} (run {total_triggered + 1}/{total_workers}) (Run ID: {run_id})")
            logger.debug(f"Full conf for dispatcher run {run_id}: {conf_to_pass}")
@@ -343,18 +321,13 @@ with DAG(
            "'proceed_loop': (Default) Mark URL as failed but continue the processing loop with a new URL. "
            "'retry_with_new_token': Attempt to get a new token with a new account and retry the download once. If it fails again, proceed loop."
        ),
-        'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."),
+        'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with per-request parameters to override server defaults. Can be a full JSON object or comma-separated key=value pairs (e.g., 'session_params.location=DE,ytdlp_params.skip_cache=true')."),
+        'language_code': Param('en-US', type="string", title="[Worker Param] Language Code", description="The language code (e.g., 'en-US', 'de-DE') to use for the YouTube request headers."),
        'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="[Worker Param] Base name for Redis queues."),
        'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."),
        'clients': Param(
            'tv_simply',
            type="string",
-            enum=[
-                'tv_simply',
-                'mweb',
-                'tv',
-                'custom',
-            ],
            title="[Worker Param] Clients",
            description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_embedded. See DAG documentation for details."
        ),
@@ -370,23 +343,16 @@ with DAG(
        'delay_between_formats_s': Param(15, type="integer", title="[Worker Param] Delay Between Formats (s)", description="Delay in seconds between downloading each format when multiple formats are specified. A 22s wait may be effective for batch downloads, while 6-12s may suffice if cookies are refreshed regularly."),
        'yt_dlp_test_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Test Mode", description="If True, runs yt-dlp with --test flag (dry run without downloading)."),
        'skip_probe': Param(True, type="boolean", title="[Worker Param] Skip Probe", description="If True, skips the ffmpeg probe of downloaded files."),
-        'yt_dlp_cleanup_mode': Param(True, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
+        'yt_dlp_cleanup_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
        'socket_timeout': Param(15, type="integer", title="[Worker Param] Socket Timeout", description="Timeout in seconds for socket operations."),
-        'download_format_preset': Param(
-            'format_1',
+        'download_format': Param(
+            'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
            type="string",
-            enum=['format_1', 'format_2', 'custom'],
-            title="[Worker Param] Download Format Preset",
-            description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformat_1: 18-dashy/18,140-dashy/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformat_2: (299/298/137/136/135/134/133)-dashy"
-        ),
-        'download_format_custom': Param(
-            '18-dashy/18,140-dashy/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy',
-            type="string",
-            title="[Worker Param] Custom Download Format",
-            description="Custom yt-dlp format string. Used when preset is 'custom'. E.g., 'ba[ext=m4a]/bestaudio/best'."
+            title="[Worker Param] Download Format",
+            description="Custom yt-dlp format string. Common presets: [1] 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best' (Default, best quality MP4). [2] '18-dashy/18,140-dashy/140,133-dashy/134-dashy/136-dashy/137-dashy/250-dashy/298-dashy/299-dashy' (Legacy formats). [3] '299-dashy/298-dashy/250-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy' (High-framerate formats)."
        ),
        'downloader': Param(
-            'py',
+            'cli',
            type="string",
            enum=['py', 'aria-rpc', 'cli'],
            title="[Worker Param] Download Tool",
@@ -396,7 +362,7 @@ with DAG(
        'aria_port': Param(6800, type="integer", title="[Worker Param] Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server. Can be set via Airflow Variable 'YTDLP_ARIA_PORT'."),
        'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="[Worker Param] Aria2c Secret", description="For 'aria-rpc' downloader: Secret token. Can be set via Airflow Variable 'YTDLP_ARIA_SECRET'."),
        'yt_dlp_extra_args': Param(
-            '--no-resize-buffer --buffer-size 4M --min-sleep-interval 5 --max-sleep-interval 10',
+            '',
            type=["string", "null"],
            title="[Worker Param] Extra yt-dlp arguments",
        ),
@@ -17,7 +17,7 @@ from __future__ import annotations
 from airflow.decorators import task, task_group
 from airflow.exceptions import AirflowException, AirflowSkipException
 from airflow.models import Variable
-from airflow.models.dag import DAG
+from airflow.models.dag import DAG, DagModel
 from airflow.models.param import Param
 from airflow.models.xcom_arg import XComArg
 from airflow.operators.dummy import DummyOperator
@@ -174,14 +174,9 @@ def _get_account_pool(params: dict) -> list:
 is_prefix_mode = True
 pool_size = int(pool_size_param)

-if params.get('prepend_client_to_account', True):
+# The orchestrator now generates the full prefix if prepend_client_to_account is True.
-clients_str = params.get('clients', '')
+# The worker just appends the numbers.
-primary_client = clients_str.split(',')[0].strip() if clients_str else 'unknown'
+accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
-timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
-new_prefix = f"{prefix}_{timestamp}_{primary_client}"
-accounts = [f"{new_prefix}_{i:02d}" for i in range(1, pool_size + 1)]
-else:
-accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
 else:
 accounts = [prefix]

@@ -258,12 +253,26 @@ def get_url_and_assign_account(**context):
 # For manual runs, we fall back to 'manual_url_to_process'.
 url_to_process = params.get('url_to_process')
 if not url_to_process:
-url_to_process = params.get('manual_url_to_process')
+manual_url_input = params.get('manual_url_to_process')
-if url_to_process:
+if manual_url_input:
-logger.info(f"Using URL from manual run parameter: '{url_to_process}'")
+logger.info(f"Using URL from manual run parameter: '{manual_url_input}'")
+if manual_url_input == 'PULL_FROM_QUEUE':
+logger.info("Manual run is set to pull from queue.")
+redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
+queue_name = params.get('queue_name', DEFAULT_QUEUE_NAME)
+inbox_queue = f"{queue_name}_inbox"
+client = _get_redis_client(redis_conn_id)
+url_bytes = client.lpop(inbox_queue)
+if not url_bytes:
+logger.info("Redis queue is empty. No work to do. Skipping task.")
+raise AirflowSkipException("Redis queue is empty. No work to do.")
+url_to_process = url_bytes.decode('utf-8')
+logger.info(f"Pulled URL '{url_to_process}' from queue '{inbox_queue}'.")
+else:
+url_to_process = manual_url_input

 if not url_to_process:
-raise AirflowException("No URL to process. For manual runs, please provide a URL in the 'manual_url_to_process' parameter.")
+raise AirflowException("No URL to process. For manual runs, please provide a URL in the 'manual_url_to_process' parameter, or 'PULL_FROM_QUEUE'.")
 logger.info(f"Received URL '{url_to_process}' to process.")

 # Mark the URL as in-progress in Redis
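The queue-pull branch added above boils down to a single lpop against the '<queue_name>_inbox' list. A minimal stand-alone sketch (the direct redis.Redis(...) connection and the defaults here are illustrative; the DAG goes through its own _get_redis_client helper and Airflow params):

    import redis
    from airflow.exceptions import AirflowSkipException

    def pull_next_url(queue_name: str = "ytdlp_ops", host: str = "localhost", port: int = 6379) -> str:
        """Pop one URL from '<queue_name>_inbox'; skip the task when the inbox is empty."""
        client = redis.Redis(host=host, port=port)
        url_bytes = client.lpop(f"{queue_name}_inbox")
        if not url_bytes:
            # Mirrors the DAG: an empty inbox is not a failure, the task is simply skipped.
            raise AirflowSkipException("Redis queue is empty. No work to do.")
        return url_bytes.decode("utf-8")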
@@ -310,9 +319,26 @@ def get_token(initial_data: dict, **context):
 host, port = params['service_ip'], int(params['service_port'])
 machine_id = params.get('machine_id') or socket.gethostname()
 clients = params.get('clients')
-request_params_json = params.get('request_params_json', '{}')
+request_params_json = params.get('request_params_json')
+language_code = params.get('language_code')
 assigned_proxy_url = params.get('assigned_proxy_url')

+if language_code:
+try:
+params_dict = json.loads(request_params_json)
+logger.info(f"Setting language for request: {language_code}")
+if 'session_params' not in params_dict:
+params_dict['session_params'] = {}
+params_dict['session_params']['lang'] = language_code
+request_params_json = json.dumps(params_dict)
+except (json.JSONDecodeError, TypeError):
+logger.warning("Could not parse request_params_json as JSON. Treating as key=value pairs and appending language code.")
+lang_kv = f"session_params.lang={language_code}"
+if request_params_json:
+request_params_json += f",{lang_kv}"
+else:
+request_params_json = lang_kv

 video_id = _extract_video_id(url)
 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 job_dir_name = f"{timestamp}-{video_id or 'unknown'}"
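The language handling above is essentially "merge session_params.lang into whatever shape request_params_json arrives in". A stand-alone sketch of that merge (the function name is illustrative, not part of the DAG):

    import json

    def inject_language(request_params_json: str, language_code: str) -> str:
        """Set session_params.lang, falling back to key=value syntax when the input is not JSON."""
        try:
            params_dict = json.loads(request_params_json)
            if 'session_params' not in params_dict:
                params_dict['session_params'] = {}
            params_dict['session_params']['lang'] = language_code
            return json.dumps(params_dict)
        except (json.JSONDecodeError, TypeError):
            # Non-JSON input is treated as a comma-separated key=value string, as in the DAG.
            lang_kv = f"session_params.lang={language_code}"
            return f"{request_params_json},{lang_kv}" if request_params_json else lang_kv

    # inject_language('{}', 'de-DE') -> '{"session_params": {"lang": "de-DE"}}'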
@@ -355,18 +381,39 @@ def get_token(initial_data: dict, **context):

 if process.returncode != 0:
 error_message = "ytops-client failed. See logs for details."
-for line in reversed(process.stderr.strip().split('\n')):
+# Try to find a more specific error message from the Thrift client's output
-if 'ERROR' in line or 'Thrift error' in line or 'Connection to server failed' in line:
+thrift_error_match = re.search(r'A Thrift error occurred: (.*)', process.stderr)
-error_message = line.strip()
+if thrift_error_match:
-break
+error_message = thrift_error_match.group(1).strip()
+else: # Fallback to old line-by-line parsing
+for line in reversed(process.stderr.strip().split('\n')):
+if 'ERROR' in line or 'Thrift error' in line or 'Connection to server failed' in line:
+error_message = line.strip()
+break

+# Determine error code for branching logic
 error_code = 'GET_INFO_CLIENT_FAIL'
-if "BOT_DETECTED" in process.stderr:
+stderr_lower = process.stderr.lower()
-error_code = "BOT_DETECTED"
-elif "BOT_DETECTION_SIGN_IN_REQUIRED" in process.stderr:
+# These patterns should match the error codes from PBUserException and others
-error_code = "BOT_DETECTION_SIGN_IN_REQUIRED"
+error_patterns = {
-elif "Connection to server failed" in process.stderr:
+"BOT_DETECTED": ["bot_detected"],
-error_code = "TRANSPORT_ERROR"
+"BOT_DETECTION_SIGN_IN_REQUIRED": ["bot_detection_sign_in_required"],
+"TRANSPORT_ERROR": ["connection to server failed"],
+"PRIVATE_VIDEO": ["private video"],
+"COPYRIGHT_REMOVAL": ["copyright"],
+"GEO_RESTRICTED": ["in your country"],
+"VIDEO_REMOVED": ["video has been removed"],
+"VIDEO_UNAVAILABLE": ["video unavailable"],
+"MEMBERS_ONLY": ["members-only"],
+"AGE_GATED_SIGN_IN": ["sign in to confirm your age"],
+"VIDEO_PROCESSING": ["processing this video"],
+}

+for code, patterns in error_patterns.items():
+if any(p in stderr_lower for p in patterns):
+error_code = code
+break # Found a match, stop searching

 error_details = {
 'error_message': error_message,
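The pattern table introduced above is a first-match scan over lowercased stderr; the full table is in the diff, and this sketch only shows the shape of the lookup with an abbreviated copy of it:

    ERROR_PATTERNS = {
        "BOT_DETECTED": ["bot_detected"],
        "TRANSPORT_ERROR": ["connection to server failed"],
        "PRIVATE_VIDEO": ["private video"],
        "VIDEO_UNAVAILABLE": ["video unavailable"],
    }

    def classify_stderr(stderr: str, default: str = "GET_INFO_CLIENT_FAIL") -> str:
        """Return the first error code whose pattern appears in stderr (case-insensitive)."""
        stderr_lower = stderr.lower()
        for code, patterns in ERROR_PATTERNS.items():
            if any(p in stderr_lower for p in patterns):
                return code  # first match wins, so dict order matters
        return default

    # classify_stderr("ERROR: Private video. Sign in ...") -> 'PRIVATE_VIDEO'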
@@ -381,8 +428,23 @@ def get_token(initial_data: dict, **context):
 if proxy_match:
 proxy = proxy_match.group(1).strip()

+# Rename the info.json to include the proxy for the download worker
+final_info_json_path = info_json_path
+if proxy:
+# Sanitize for filename: replace '://' which is invalid in paths. Colons are usually fine.
+sanitized_proxy = proxy.replace('://', '---')

+new_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}_proxy_{sanitized_proxy}.json"
+new_path = os.path.join(job_dir_path, new_filename)
+try:
+os.rename(info_json_path, new_path)
+final_info_json_path = new_path
+logger.info(f"Renamed info.json to include proxy: {new_path}")
+except OSError as e:
+logger.error(f"Failed to rename info.json to include proxy: {e}. Using original path.")

 return {
-'info_json_path': info_json_path,
+'info_json_path': final_info_json_path,
 'job_dir_path': job_dir_path,
 'socks_proxy': proxy,
 'ytdlp_command': None,
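The proxy handoff between the auth and download steps relies on a reversible sanitization ('://' becomes '---' in the filename). A self-contained sketch of both directions (helper names are illustrative; the filename layout follows the diff):

    import os
    import re

    def encode_proxy_in_name(video_id: str, account_id: str, timestamp: str, proxy: str) -> str:
        """Build the info.json filename carrying the proxy, as the auth step does."""
        sanitized = proxy.replace('://', '---')
        return f"info_{video_id}_{account_id}_{timestamp}_proxy_{sanitized}.json"

    def decode_proxy_from_path(info_json_path: str) -> str | None:
        """Recover the proxy URL from the filename, as the download step does."""
        match = re.search(r'_proxy_(.+)\.json$', os.path.basename(info_json_path))
        return match.group(1).replace('---', '://') if match else None

    name = encode_proxy_in_name('abc123', 'acct_01', '20240101_000000', 'socks5://10.0.0.2:1080')
    assert decode_proxy_from_path(name) == 'socks5://10.0.0.2:1080'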
@@ -407,10 +469,15 @@ def handle_bannable_error_branch(task_id_to_check: str, **context):
 error_code = error_details.get('error_code', '').strip()
 policy = params.get('on_auth_failure', 'retry_with_new_account')

-# Check if this is an age confirmation error - should not stop the loop
+# Unrecoverable video errors that should not be retried or treated as system failures.
-if "Sign in to confirm your age" in error_message or "confirm your age" in error_message.lower():
+unrecoverable_video_errors = [
-logger.info(f"Age confirmation error detected for '{task_id_to_check}'. This is a content restriction, not a bot detection issue.")
+"AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "COPYRIGHT_REMOVAL",
-return 'handle_age_restriction_error'
+"GEO_RESTRICTED", "VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED"
+]

+if error_code in unrecoverable_video_errors:
+logger.warning(f"Unrecoverable video error '{error_code}' detected for '{task_id_to_check}'. This is a content issue, not a system failure.")
+return 'handle_unrecoverable_video_error'

 # Fatal Thrift connection errors that should stop all processing.
 if error_code == 'TRANSPORT_ERROR':
@@ -646,6 +713,65 @@ def list_available_formats(token_data: dict, **context):
 return []


+def _resolve_generic_selector(selector: str, info_json_path: str, logger) -> str | list[str] | None:
+"""
+Uses yt-dlp to resolve a generic format selector into specific, numeric format ID(s).
+Returns a numeric selector string (e.g., '18'), a list of IDs for '+' selectors
+(e.g., ['299', '140']), or None if resolution fails.
+"""
+import subprocess
+import shlex

+try:
+cmd = [
+'yt-dlp',
+'--print', 'format_id',
+'-f', selector,
+'--load-info-json', info_json_path,
+]

+copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
+logger.info(f"Resolving generic selector '{selector}' with command: {copy_paste_cmd}")

+process = subprocess.run(cmd, capture_output=True, text=True, timeout=60)

+if process.stderr:
+# yt-dlp often prints warnings to stderr that are not fatal.
+# e.g., "Requested format selector '...' contains no available formats"
+logger.info(f"yt-dlp resolver STDERR for selector '{selector}':\n{process.stderr}")

+if process.returncode != 0:
+logger.error(f"yt-dlp resolver for selector '{selector}' failed with exit code {process.returncode}")
+return None

+output_ids = process.stdout.strip().split('\n')
+output_ids = [fid for fid in output_ids if fid] # Remove empty lines

+if not output_ids:
+logger.warning(f"Selector '{selector}' resolved to no format IDs.")
+return None

+# yt-dlp might return '137+140' on one line, or '137\n140' on multiple.
+# We need to handle both to get individual IDs.
+final_ids = []
+for fid in output_ids:
+final_ids.extend(fid.split('+'))

+# If the original selector was for merging (contained '+'), return individual IDs for separate downloads.
+# Otherwise, yt-dlp has already chosen the best one from a fallback list, so we just use it.
+if '+' in selector:
+resolved_selector = final_ids
+else:
+resolved_selector = final_ids[0] # yt-dlp gives the single best choice

+logger.info(f"Successfully resolved selector '{selector}' to '{resolved_selector}'.")
+return resolved_selector

+except Exception as e:
+logger.error(f"An error occurred while resolving selector '{selector}': {e}", exc_info=True)
+return None


 @task
 def download_and_probe(token_data: dict, available_formats: list[str], **context):
 """
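The new resolver delegates selector resolution to yt-dlp itself via --print format_id against an already-fetched info.json. Roughly, the call it wraps looks like this (a sketch, not the DAG helper; error handling and logging are stripped down):

    import subprocess

    def resolve_format_ids(selector: str, info_json_path: str) -> list[str]:
        """Ask yt-dlp which concrete format IDs a selector resolves to for a saved info.json."""
        proc = subprocess.run(
            ['yt-dlp', '--print', 'format_id', '-f', selector, '--load-info-json', info_json_path],
            capture_output=True, text=True, timeout=60,
        )
        if proc.returncode != 0:
            return []
        ids = []
        for line in proc.stdout.strip().splitlines():
            if line:
                ids.extend(line.split('+'))  # '137+140' means a merge of two IDs
        return ids

    # resolve_format_ids('bestvideo[ext=mp4]+bestaudio[ext=m4a]', 'info_abc123.json') might return ['137', '140']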
@@ -660,26 +786,33 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
 try:
 params = context['params']
 info_json_path = token_data.get('info_json_path')
-proxy = token_data.get('socks_proxy')
 original_url = token_data.get('original_url')

+# Extract proxy from filename, with fallback to token_data for backward compatibility
+proxy = None
+if info_json_path:
+filename = os.path.basename(info_json_path)
+proxy_match = re.search(r'_proxy_(.+)\.json$', filename)
+if proxy_match:
+sanitized_proxy = proxy_match.group(1)
+# Reverse sanitization from auth worker (replace '---' with '://')
+proxy = sanitized_proxy.replace('---', '://')
+logger.info(f"Extracted proxy '{proxy}' from filename.")

+if not proxy:
+logger.warning("Proxy not found in filename. Falling back to 'socks_proxy' from token_data.")
+proxy = token_data.get('socks_proxy')

 download_dir = token_data.get('job_dir_path')
 if not download_dir:
 # Fallback for older runs or if job_dir_path is missing
 download_dir = os.path.dirname(info_json_path)

-format_preset = params.get('download_format_preset', 'format_1')
+download_format = params.get('download_format')
-if format_preset == 'custom':
+if not download_format:
-download_format = params.get('download_format_custom')
+raise AirflowException("The 'download_format' parameter is missing or empty.")
-if not download_format:
-raise AirflowException("Format preset is 'custom' but no custom format string was provided.")
-elif format_preset == 'format_1':
-download_format = '18-dashy/18,140-dashy/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy'
-elif format_preset == 'format_2':
-download_format = '(299/298/137/136/135/134/133)-dashy'
-else:
-download_format = '18-dashy/18,140-dashy/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy'

-output_template = params.get('output_path_template', "%(title)s [%(id)s].f%(format_id)s.%(ext)s")
+output_template = params.get('output_path_template', "%(id)s.f%(format_id)s.%(ext)s")
 full_output_path = os.path.join(download_dir, output_template)
 retry_on_probe_failure = params.get('retry_on_probe_failure', False)

@@ -706,15 +839,16 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
 downloader = params.get('downloader', 'py')
 cmd = ['ytops-client', 'download', downloader, '--load-info-json', info_json_path, '-f', format_selector]

-if proxy:
-cmd.extend(['--proxy', proxy])

 if downloader == 'py':
+if proxy:
+cmd.extend(['--proxy', proxy])
 cmd.extend(['--output-dir', download_dir])
 # The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args

 # The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args
-py_extra_args = []
+py_extra_args = ['--output', output_template, '--no-resize-buffer', '--buffer-size', '4M']
+if params.get('fragment_retries'):
+py_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])])
 if params.get('socket_timeout'):
 py_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])])
 if params.get('yt_dlp_test_mode'):
@@ -727,12 +861,29 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
 cmd.extend(['--extra-ytdlp-args', final_extra_args_str])

 elif downloader == 'aria-rpc':
+# For aria2c running on the host, the proxy (if also on the host) should be referenced via localhost.
+# The user-agent is set by yt-dlp's extractor, not directly here. The default is Cobalt-based.
+if proxy:
+proxy_port_match = re.search(r':(\d+)$', proxy)
+if proxy_port_match:
+proxy_port = proxy_port_match.group(1)
+aria_proxy = f"socks5://127.0.0.1:{proxy_port}"
+cmd.extend(['--proxy', aria_proxy])
+logger.info(f"Using translated proxy for host-based aria2c: {aria_proxy}")
+else:
+logger.warning(f"Could not parse port from proxy '{proxy}'. Passing it to aria2c as-is.")
+cmd.extend(['--proxy', proxy])

+# The remote-dir is the path relative to aria2c's working directory on the host.
+# The output-dir is the container's local path to the same shared volume.
+remote_dir = os.path.relpath(download_dir, '/opt/airflow/downloadfiles/videos')
 cmd.extend([
 '--aria-host', params.get('aria_host', '172.17.0.1'),
 '--aria-port', str(params.get('aria_port', 6800)),
 '--aria-secret', params.get('aria_secret'),
 '--wait',
 '--output-dir', download_dir,
+'--remote-dir', remote_dir,
 ])
 if 'dashy' in format_selector:
 cmd.extend([
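The proxy rewrite for a host-side aria2c keeps the port but swaps the address for loopback; as a sketch (the socks5 scheme is assumed, matching the diff):

    import re

    def to_host_loopback_proxy(proxy: str) -> str:
        """Rewrite e.g. 'socks5://gateway:1080' as 'socks5://127.0.0.1:1080'; pass through if no port is found."""
        match = re.search(r':(\d+)$', proxy)
        return f"socks5://127.0.0.1:{match.group(1)}" if match else proxy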
@@ -743,9 +894,15 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
 cmd.append('--cleanup')

 elif downloader == 'cli':
-cmd.extend(['--output-dir', download_dir])
+# Overwrite cmd to call yt-dlp directly
+cmd = ['yt-dlp', '--load-info-json', info_json_path, '-f', format_selector]
+if proxy:
+cmd.extend(['--proxy', proxy])

 # The 'cli' tool is the old yt-dlp wrapper, so it takes similar arguments.
-cli_extra_args = []
+cli_extra_args = ['--output', full_output_path, '--no-resize-buffer', '--buffer-size', '4M']
+if params.get('fragment_retries'):
+cli_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])])
 if params.get('socket_timeout'):
 cli_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])])
 if params.get('yt_dlp_test_mode'):
@@ -754,11 +911,12 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
 existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '')
 final_extra_args = existing_extra + cli_extra_args
 if final_extra_args:
-cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)])
+cmd.extend(final_extra_args)

 copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
-logger.info(f"--- Preparing to execute ytops-client ---")
+tool_name = 'yt-dlp' if downloader == 'cli' else 'ytops-client'
-logger.info(f"Full ytops-client command for format '{format_selector}':")
+logger.info(f"--- Preparing to execute {tool_name} ---")
+logger.info(f"Full {tool_name} command for format '{format_selector}':")
 logger.info(copy_paste_cmd)
 logger.info(f"-----------------------------------------")
 process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
@@ -768,23 +926,44 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
 if process.stderr:
 logger.info(f"Download tool STDERR for format '{format_selector}':\n{process.stderr}")

-if process.returncode != 0:
+if process.returncode != 0 or "ERROR:" in process.stderr:
 logger.error(f"Download tool failed for format '{format_selector}' with exit code {process.returncode}")
-raise AirflowException(f"Download command failed for format '{format_selector}'. See logs for details.")
+if "ERROR:" in process.stderr and process.returncode == 0:
+logger.error("Detected 'ERROR:' in stderr, treating as failure despite exit code 0.")

+# Pass stderr in the exception for better parsing in the outer try/except block
+raise AirflowException(f"Download command failed for format '{format_selector}'. Stderr: {process.stderr}")

 output_files = []
-for line in process.stdout.strip().split('\n'):
+if downloader == 'cli':
-# For aria-rpc, parse "Download and merge successful: <path>" or "Download successful: <path>"
+# Parse yt-dlp's verbose output to find the final filename
-match = re.search(r'successful: (.+)', line)
+final_filename = None
-if match:
+for line in process.stdout.strip().split('\n'):
-filepath = match.group(1).strip()
+# Case 1: Simple download, no merge
-if os.path.exists(filepath):
+dest_match = re.search(r'\[download\] Destination: (.*)', line)
-output_files.append(filepath)
+if dest_match:
-else:
+final_filename = dest_match.group(1).strip()
-logger.warning(f"File path from aria-rpc output does not exist locally: '{filepath}'")
-# For py/cli, it's just the path
+# Case 2: Formats are merged into a new file. This path is absolute if -o is absolute.
-elif os.path.exists(line.strip()):
+merge_match = re.search(r'\[Merger\] Merging formats into "(.*)"', line)
-output_files.append(line.strip())
+if merge_match:
+final_filename = merge_match.group(1).strip()

+if final_filename and os.path.exists(final_filename):
+output_files.append(final_filename)
+else: # Logic for 'py' and 'aria-rpc'
+for line in process.stdout.strip().split('\n'):
+# For aria-rpc, parse "Download and merge successful: <path>" or "Download successful: <path>"
+match = re.search(r'successful: (.+)', line)
+if match:
+filepath = match.group(1).strip()
+if os.path.exists(filepath):
+output_files.append(filepath)
+else:
+logger.warning(f"File path from aria-rpc output does not exist locally: '{filepath}'")
+# For py, it's just the path
+elif os.path.exists(line.strip()):
+output_files.append(line.strip())

 if not params.get('yt_dlp_test_mode') and not output_files:
 raise AirflowException(f"Download for format '{format_selector}' finished but no output files were found or exist.")
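For the 'cli' path, the final file name is scraped from yt-dlp's own progress output; a stand-alone sketch of that scrape (the two regexes mirror the diff, the sample lines are illustrative):

    import re

    def final_file_from_stdout(stdout: str) -> str | None:
        """Return the last Destination/Merger path seen in yt-dlp output; a merge overrides the per-format file."""
        final_filename = None
        for line in stdout.splitlines():
            dest = re.search(r'\[download\] Destination: (.*)', line)
            if dest:
                final_filename = dest.group(1).strip()
            merged = re.search(r'\[Merger\] Merging formats into "(.*)"', line)
            if merged:
                final_filename = merged.group(1).strip()
        return final_filename

    sample = '[download] Destination: abc123.f137.mp4\n[Merger] Merging formats into "abc123.mp4"'
    assert final_file_from_stdout(sample) == 'abc123.mp4'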
@@ -797,7 +976,7 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
 """Probes a file with ffmpeg to check for corruption."""
 logger.info(f"Probing downloaded file: {filename}")
 try:
-subprocess.run(['ffmpeg', '-v', 'error', '-i', filename, '-f', 'null', '-'], check=True, capture_output=True, text=True)
+subprocess.run(['ffmpeg', '-v', 'error', '-sseof', '-10', '-i', filename, '-c', 'copy', '-f', 'null', '-'], check=True, capture_output=True, text=True)
 logger.info(f"SUCCESS: Probe confirmed valid media file: {filename}")
 except subprocess.CalledProcessError as e:
 logger.error(f"ffmpeg probe failed for '{filename}'. File may be corrupt.")
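The probe change above trades a full decode for a tail-only remux check (-sseof -10 seeks to 10 seconds before end-of-file, -c copy skips decoding), which is far cheaper on large files. As a sketch of the same call:

    import subprocess

    def quick_probe(filename: str) -> bool:
        """Return True if the last ~10 seconds of the file remux cleanly; nothing is decoded."""
        try:
            subprocess.run(
                ['ffmpeg', '-v', 'error', '-sseof', '-10', '-i', filename, '-c', 'copy', '-f', 'null', '-'],
                check=True, capture_output=True, text=True,
            )
            return True
        except subprocess.CalledProcessError:
            return False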
@@ -864,30 +1043,58 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
 if not formats_to_download_initial:
 raise AirflowException("No valid download format selectors were found after parsing.")

-# --- Filter requested formats against available formats ---
+# --- Filter and resolve requested formats ---
 final_formats_to_download = []
 if not available_formats:
-logger.warning("List of available formats is empty. Will attempt to download all requested formats without validation.")
+logger.warning("List of available formats is empty. Cannot validate numeric selectors, but will attempt to resolve generic selectors.")
-final_formats_to_download = formats_to_download_initial
-else:
+for selector in formats_to_download_initial:
-for selector in formats_to_download_initial:
+# A selector is considered generic if it contains keywords like 'best' or filter brackets '[]'.
-# A selector can be '140' or '299/298/137' or '140-dashy'
+is_generic = bool(re.search(r'(best|\[|\])', selector))

+if is_generic:
+resolved_selector = _resolve_generic_selector(selector, info_json_path, logger)
+if resolved_selector:
+# The resolver returns a list for '+' selectors, or a string for others.
+resolved_formats = resolved_selector if isinstance(resolved_selector, list) else [resolved_selector]

+for res_format in resolved_formats:
+# Prefer -dashy version if available and the format is a simple numeric ID
+if res_format.isdigit() and f"{res_format}-dashy" in available_formats:
+final_format = f"{res_format}-dashy"
+logger.info(f"Resolved format '{res_format}' from selector '{selector}'. Preferred '-dashy' version: '{final_format}'.")
+else:
+final_format = res_format

+# Validate the chosen format against available formats
+if available_formats:
+individual_ids = re.split(r'[/+]', final_format)
+is_available = any(fid in available_formats for fid in individual_ids)

+if is_available:
+final_formats_to_download.append(final_format)
+else:
+logger.warning(f"Resolved format '{final_format}' (from '{selector}') contains no available formats. Skipping.")
+else:
+# Cannot validate, so we trust the resolver's output.
+final_formats_to_download.append(final_format)
+else:
+logger.warning(f"Could not resolve generic selector '{selector}' using yt-dlp. Skipping.")
+else:
+# This is a numeric-based selector (e.g., '140' or '299/298' or '140-dashy').
+# Validate it against the available formats.
+if not available_formats:
+logger.warning(f"Cannot validate numeric selector '{selector}' because available formats list is empty. Assuming it's valid.")
+final_formats_to_download.append(selector)
+continue

 individual_ids = re.split(r'[/+]', selector)
+is_available = any(fid in available_formats for fid in individual_ids)
-# Extract the numeric part of the format ID for checking against available_formats
-is_available = False
-for fid in individual_ids:
-numeric_id_match = re.match(r'^\d+', fid)
-if numeric_id_match:
-numeric_id = numeric_id_match.group(0)
-if numeric_id in available_formats:
-is_available = True
-break # Found a match, no need to check other parts of the selector

 if is_available:
 final_formats_to_download.append(selector)
 else:
-logger.warning(f"Requested format selector '{selector}' contains no available formats. Skipping.")
+logger.warning(f"Requested numeric format selector '{selector}' contains no available formats. Skipping.")

 if not final_formats_to_download:
 raise AirflowException("None of the requested formats are available for this video.")
@@ -909,6 +1116,11 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context

 logger.warning(f"Probe failed for {len(failed_files)} file(s). Attempting one re-download for failed files...")

+delay_between_formats = params.get('delay_between_formats_s', 0)
+if delay_between_formats > 0:
+logger.info(f"Waiting {delay_between_formats}s before re-download attempt...")
+time.sleep(delay_between_formats)

 format_ids_to_retry = []
 # Since each download is now for a specific selector and the output template
 # includes the format_id, we can always attempt to extract the format_id
@@ -945,7 +1157,7 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
 if not final_success_list:
 raise AirflowException("Download and probe process completed but produced no valid files.")

-if params.get('yt_dlp_cleanup_mode', True):
+if params.get('yt_dlp_cleanup_mode', False):
 logger.info(f"Cleanup mode is enabled. Creating .empty files and deleting originals for {len(final_success_list)} files.")
 for f in final_success_list:
 try:
@@ -965,6 +1177,26 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
 if not video_id:
 logger.error(f"Could not extract video_id from URL '{original_url}' for final move. Skipping.")
 else:
+# --- Rename info.json to a simple format before moving ---
+path_to_info_json_for_move = info_json_path # Default to original path
+try:
+# info_json_path is the full path to the original info.json
+if info_json_path and os.path.exists(info_json_path):
+new_info_json_name = f"info_{video_id}.json"
+new_info_json_path = os.path.join(os.path.dirname(info_json_path), new_info_json_name)

+if info_json_path != new_info_json_path:
+logger.info(f"Renaming '{info_json_path}' to '{new_info_json_path}' for final delivery.")
+os.rename(info_json_path, new_info_json_path)
+path_to_info_json_for_move = new_info_json_path
+else:
+logger.info("info.json already has the simple name. No rename needed.")
+else:
+logger.warning("Could not find info.json to rename before moving.")
+except Exception as rename_e:
+logger.error(f"Failed to rename info.json before move: {rename_e}", exc_info=True)
+# --- End of rename logic ---

 source_dir = download_dir # This is the job_dir_path

 # Group downloads into 10-minute batch folders based on completion time.
@@ -982,18 +1214,65 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
 logger.warning(f"Destination '{final_dir_path}' already exists. It will be removed and replaced.")
 shutil.rmtree(final_dir_path)

-os.rename(source_dir, final_dir_path)
+# Create the destination directory and move only the essential files, then clean up the source.
-logger.info(f"Successfully moved job to '{final_dir_path}'.")
+# This ensures no temporary or junk files are carried over.
+os.makedirs(final_dir_path)

+# 1. Move the info.json file
+if path_to_info_json_for_move and os.path.exists(path_to_info_json_for_move):
+shutil.move(path_to_info_json_for_move, final_dir_path)
+logger.info(f"Moved '{os.path.basename(path_to_info_json_for_move)}' to destination.")

+# 2. Move the media files (or their .empty placeholders)
+files_to_move = []
+if params.get('yt_dlp_cleanup_mode', False):
+files_to_move = [f"{f}.empty" for f in final_success_list]
+else:
+files_to_move = final_success_list

+for f in files_to_move:
+if os.path.exists(f):
+shutil.move(f, final_dir_path)
+logger.info(f"Moved '{os.path.basename(f)}' to destination.")
+else:
+logger.warning(f"File '{f}' expected but not found for moving.")

+# 3. Clean up the original source directory
+logger.info(f"Cleaning up original source directory '{source_dir}'")
+shutil.rmtree(source_dir)
+logger.info(f"Successfully moved job to '{final_dir_path}' and cleaned up source.")
 except Exception as e:
 logger.error(f"Failed to move completed job directory: {e}", exc_info=True)
 # Do not fail the task for a move error, just log it.

 return final_success_list
 except Exception as e:
-if 'HTTP Error 403: Forbidden' in str(e):
+ti = context['task_instance']
-logger.warning("Detected 'HTTP Error 403: Forbidden' in download error. Pushing details to XCom for branching.")
+error_message = str(e)
-ti = context['task_instance']
+error_code = "DOWNLOAD_FAILED"
-ti.xcom_push(key='download_error_details', value={'error_code': 'HTTP_403_FORBIDDEN', 'error_message': str(e)})
+msg_lower = error_message.lower()

+unrecoverable_patterns = {
+"AGE_GATED_SIGN_IN": ['sign in to confirm your age'],
+"MEMBERS_ONLY": ['members-only content'],
+"VIDEO_PROCESSING": ['processing this video'],
+"COPYRIGHT_REMOVAL": ['copyright'],
+"GEO_RESTRICTED": ['in your country'],
+"PRIVATE_VIDEO": ['private video'],
+"VIDEO_REMOVED": ['video has been removed'],
+"VIDEO_UNAVAILABLE": ['video unavailable'],
+"HTTP_403_FORBIDDEN": ['http error 403: forbidden']
+}

+for code, patterns in unrecoverable_patterns.items():
+if any(p in msg_lower for p in patterns):
+error_code = code
+break

+# Always push details to XCom for the branch operator to inspect.
+error_details = {'error_code': error_code, 'error_message': error_message}
+ti.xcom_push(key='download_error_details', value=error_details)

 raise AirflowException(f"Download and probe failed: {e}") from e

 @task
@@ -1256,6 +1535,12 @@ def continue_processing_loop(**context):
 logger.info(f"DAG run '{dag_run.run_id}' does not appear to be triggered by the dispatcher. Stopping processing loop.")
 return

+dispatcher_dag_id = 'ytdlp_ops_v01_dispatcher'
+dag_model = DagModel.get_dagmodel(dispatcher_dag_id)
+if dag_model and dag_model.is_paused:
+logger.warning(f"Dispatcher DAG '{dispatcher_dag_id}' is paused. Stopping processing loop.")
+return

 # Create a new unique run_id for the dispatcher.
 # Using a timestamp and UUID ensures the ID is unique and does not grow in length over time,
 # preventing database errors.
@@ -1270,7 +1555,7 @@ def continue_processing_loop(**context):

 logger.info(f"Worker finished successfully. Triggering a new dispatcher ('{new_dispatcher_run_id}') to continue the loop.")
 trigger_dag(
-dag_id='ytdlp_ops_v01_dispatcher',
+dag_id=dispatcher_dag_id,
 run_id=new_dispatcher_run_id,
 conf=conf_to_pass,
 replace_microseconds=False
@@ -1292,10 +1577,15 @@ def handle_retry_failure_branch(task_id_to_check: str, **context):
 error_message = error_details.get('error_message', '').strip()
 error_code = error_details.get('error_code', '').strip()

-# Check if this is an age confirmation error - should not stop the loop
+# Unrecoverable video errors that should not be retried or treated as system failures.
-if "Sign in to confirm your age" in error_message or "confirm your age" in error_message.lower():
+unrecoverable_video_errors = [
-logger.info(f"Age confirmation error detected on retry from '{task_id_to_check}'. This is a content restriction, not a bot detection issue.")
+"AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "COPYRIGHT_REMOVAL",
-return 'handle_age_restriction_error'
+"GEO_RESTRICTED", "VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED"
+]

+if error_code in unrecoverable_video_errors:
+logger.warning(f"Unrecoverable video error '{error_code}' detected on retry for '{task_id_to_check}'.")
+return 'handle_unrecoverable_video_error'

 if error_code == 'TRANSPORT_ERROR':
 logger.error(f"Fatal Thrift connection error on retry from '{task_id_to_check}'.")
@@ -1337,6 +1627,17 @@ def handle_download_failure_branch(**context):
 # The full task_id for download_and_probe is 'download_processing.download_and_probe'
 download_error_details = ti.xcom_pull(task_ids='download_processing.download_and_probe', key='download_error_details')

+if download_error_details:
+error_code = download_error_details.get('error_code')
+unrecoverable_video_errors = [
+"AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "COPYRIGHT_REMOVAL",
+"GEO_RESTRICTED", "VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED",
+"HTTP_403_FORBIDDEN"
+]
+if error_code in unrecoverable_video_errors:
+logger.warning(f"Unrecoverable video error '{error_code}' during download. Skipping.")
+return 'handle_unrecoverable_video_error'

 if policy == 'retry_with_new_token':
 logger.info("Download failed. Policy is to retry with a new token. Branching to retry logic.")
 return 'retry_logic_for_download'
@@ -1366,6 +1667,58 @@ def coalesce_token_data(get_token_result=None, retry_get_token_result=None):
 raise AirflowException("Could not find a successful token result from any attempt.")


+@task
+def handle_unrecoverable_video_error(**context):
+"""
+Handles errors for videos that are unavailable (private, removed, etc.).
+These are not system failures, so the URL is logged to a 'skipped' queue
+and the processing loop continues without marking the run as failed.
+"""
+params = context['params']
+ti = context['task_instance']
+url = params.get('url_to_process', 'unknown')

+# Collect error details from the failed task
+error_details = {}
+auth_error = ti.xcom_pull(task_ids='initial_attempt.get_token', key='error_details')
+auth_retry_error = ti.xcom_pull(task_ids='retry_logic.retry_get_token', key='error_details')
+download_error = ti.xcom_pull(task_ids='download_processing.download_and_probe', key='download_error_details')

+if auth_retry_error: error_details = auth_retry_error
+elif auth_error: error_details = auth_error
+elif download_error: error_details = download_error

+error_code = error_details.get('error_code', 'UNKNOWN_VIDEO_ERROR')
+error_message = error_details.get('error_message', 'Video is unavailable for an unknown reason.')

+logger.warning(f"Skipping URL '{url}' due to unrecoverable video error: {error_code} - {error_message}")

+result_data = {
+'status': 'skipped',
+'end_time': time.time(),
+'url': url,
+'dag_run_id': context['dag_run'].run_id,
+'reason': error_code,
+'details': error_message,
+'error_details': error_details
+}

+try:
+client = _get_redis_client(params['redis_conn_id'])

+skipped_queue = f"{params['queue_name']}_skipped"
+progress_queue = f"{params['queue_name']}_progress"

+with client.pipeline() as pipe:
+pipe.hset(skipped_queue, url, json.dumps(result_data))
+pipe.hdel(progress_queue, url)
+pipe.execute()

+logger.info(f"Stored skipped result for URL '{url}' in '{skipped_queue}' and removed from progress queue.")
+except Exception as e:
+logger.error(f"Could not report skipped video to Redis: {e}", exc_info=True)


 @task
 def report_bannable_and_continue(**context):
 """
@@ -1428,71 +1781,6 @@ def report_bannable_and_continue(**context):
 logger.error(f"Could not report bannable error to Redis: {e}", exc_info=True)


-@task
-def handle_age_restriction_error(**context):
-"""
-Handles age restriction errors specifically. These are content restrictions
-that cannot be bypassed by using different accounts, so we report the failure
-and continue the processing loop rather than stopping it.
-"""
-params = context['params']
-ti = context['task_instance']
-url = params.get('url_to_process', 'unknown')

-# Collect error details
-error_details = {}
-first_token_task_id = 'get_token'
-retry_token_task_id = 'retry_get_token'

-first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
-retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')

-# Use the most recent error details
-if retry_token_error:
-error_details = retry_token_error
-elif first_token_error:
-error_details = first_token_error

-logger.error(f"Age restriction error for URL '{url}'. This content requires age confirmation and cannot be bypassed.")

-# Report failure to Redis so the URL can be marked as failed
-try:
-client = _get_redis_client(params['redis_conn_id'])

-# Update client-specific stats
-try:
-machine_id = params.get('machine_id') or socket.gethostname()
-_update_client_stats(client, params.get('clients', ''), 'failure', url, machine_id, context['dag_run'].run_id)
-except Exception as e:
-logger.error(f"Could not update client stats on age restriction error: {e}", exc_info=True)

-result_data = {
-'status': 'failed',
-'end_time': time.time(),
-'url': url,
-'dag_run_id': context['dag_run'].run_id,
-'error': 'age_restriction',
-'error_message': 'Content requires age confirmation',
-'error_details': error_details
-}
-result_queue = f"{params['queue_name']}_result"
-fail_queue = f"{params['queue_name']}_fail"

-progress_queue = f"{params['queue_name']}_progress"

-with client.pipeline() as pipe:
-pipe.hset(result_queue, url, json.dumps(result_data))
-pipe.hset(fail_queue, url, json.dumps(result_data))
-pipe.hdel(progress_queue, url)
-pipe.execute()

-logger.info(f"Stored age restriction error for URL '{url}' in '{result_queue}' and '{fail_queue}'.")
-except Exception as e:
-logger.error(f"Could not report age restriction error to Redis: {e}", exc_info=True)

-# This is NOT a fatal error for the processing loop - we just continue with the next URL


 # =============================================================================
 # DAG Definition with TaskGroups
 # =============================================================================
@@ -1533,28 +1821,23 @@ with DAG(
 description="Policy for handling download or probe failures."
 ),
 'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."),
+'language_code': Param('en-US', type="string", title="[Worker Param] Language Code", description="The language code (e.g., 'en-US', 'de-DE') to use for the YouTube request headers."),
 'retry_on_probe_failure': Param(False, type="boolean"),
 'skip_probe': Param(False, type="boolean", title="[Worker Param] Skip Probe", description="If True, skips the ffmpeg probe of downloaded files."),
-'yt_dlp_cleanup_mode': Param(True, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
+'yt_dlp_cleanup_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
 'auto_create_new_accounts_on_exhaustion': Param(True, type="boolean"),
+'fragment_retries': Param(2, type="integer", title="[Worker Param] Fragment Retries", description="Number of retries for a fragment before giving up. Default is 2 to fail fast on expired tokens."),
 'delay_between_formats_s': Param(15, type="integer", title="[Worker Param] Delay Between Formats (s)", description="Delay in seconds between downloading each format when multiple formats are specified. A 22s wait may be effective for batch downloads, while 6-12s may suffice if cookies are refreshed regularly."),
 'yt_dlp_test_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Test Mode", description="If True, runs yt-dlp with --test flag (dry run without downloading)."),
 'socket_timeout': Param(15, type="integer", title="[Worker Param] Socket Timeout", description="Timeout in seconds for socket operations."),
-'download_format_preset': Param(
+'download_format': Param(
-'format_1',
+'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
 type="string",
-enum=['format_1', 'format_2', 'custom'],
+title="[Worker Param] Download Format",
-title="Download Format Preset",
+description="Custom yt-dlp format string. Common presets: [1] 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best' (Default, best quality MP4). [2] '18-dashy/18,140-dashy/140,133-dashy/134-dashy/136-dashy/137-dashy/250-dashy/298-dashy/299-dashy' (Legacy formats). [3] '299-dashy/298-dashy/250-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy' (High-framerate formats)."
-description="Select a predefined format string or choose 'custom'.\nformat_1: 18-dashy/18,140-dashy/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformat_2: (299/298/137/136/135/134/133)-dashy"
-),
-'download_format_custom': Param(
-'18-dashy/18,140-dashy/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy',
-type="string",
-title="Custom Download Format",
-description="Custom yt-dlp format string. Used when preset is 'custom'. To download multiple formats, provide a comma-separated list of format IDs (e.g., '137,140')."
 ),
 'downloader': Param(
-'py',
+'cli',
 type="string",
 enum=['py', 'aria-rpc', 'cli'],
 title="Download Tool",
|
|||||||
'aria_port': Param(6800, type="integer", title="Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server."),
|
'aria_port': Param(6800, type="integer", title="Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server."),
|
||||||
'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="Aria2c Secret", description="For 'aria-rpc' downloader: Secret token."),
|
'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="Aria2c Secret", description="For 'aria-rpc' downloader: Secret token."),
|
||||||
'yt_dlp_extra_args': Param(
|
'yt_dlp_extra_args': Param(
|
||||||
'--no-resize-buffer --buffer-size 4M --min-sleep-interval 5 --max-sleep-interval 10',
|
'',
|
||||||
type=["string", "null"],
|
type=["string", "null"],
|
||||||
title="Extra yt-dlp arguments",
|
title="Extra yt-dlp arguments",
|
||||||
),
|
),
|
||||||
# --- Manual Run / Internal Parameters ---
|
# --- Manual Run / Internal Parameters ---
|
||||||
'manual_url_to_process': Param('iPwdia3gAnk', type=["string", "null"], title="[Manual Run] URL to Process", description="For manual runs, provide a single YouTube URL to process. This is ignored if triggered by the dispatcher."),
|
'manual_url_to_process': Param('iPwdia3gAnk', type=["string", "null"], title="[Manual Run] URL to Process", description="For manual runs, provide a single YouTube URL, or the special value 'PULL_FROM_QUEUE' to pull one URL from the Redis inbox. This is ignored if triggered by the dispatcher."),
|
||||||
'url_to_process': Param(None, type=["string", "null"], title="[Internal] URL from Dispatcher", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
|
'url_to_process': Param(None, type=["string", "null"], title="[Internal] URL from Dispatcher", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
|
||||||
'worker_queue': Param(None, type=["string", "null"], title="[Internal] Worker Queue", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
|
'worker_queue': Param(None, type=["string", "null"], title="[Internal] Worker Queue", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
|
||||||
}
|
}
|
||||||
@ -1583,7 +1866,7 @@ with DAG(
|
|||||||
report_failure_and_stop_task = report_failure_and_stop()
|
report_failure_and_stop_task = report_failure_and_stop()
|
||||||
report_failure_task = report_failure_and_continue()
|
report_failure_task = report_failure_and_continue()
|
||||||
continue_loop_task = continue_processing_loop()
|
continue_loop_task = continue_processing_loop()
|
||||||
age_restriction_task = handle_age_restriction_error()
|
unrecoverable_video_error_task = handle_unrecoverable_video_error()
|
||||||
report_bannable_and_continue_task = report_bannable_and_continue()
|
report_bannable_and_continue_task = report_bannable_and_continue()
|
||||||
|
|
||||||
# --- Task Group 1: Initial Attempt ---
|
# --- Task Group 1: Initial Attempt ---
|
||||||
@ -1600,7 +1883,7 @@ with DAG(
|
|||||||
)
|
)
|
||||||
|
|
||||||
first_token_attempt >> initial_branch_task
|
first_token_attempt >> initial_branch_task
|
||||||
initial_branch_task >> [fatal_error_task, ban_and_report_immediately_task, age_restriction_task, report_bannable_and_continue_task]
|
initial_branch_task >> [fatal_error_task, ban_and_report_immediately_task, unrecoverable_video_error_task, report_bannable_and_continue_task]
|
||||||
|
|
||||||
# --- Task Group 2: Retry Logic ---
|
# --- Task Group 2: Retry Logic ---
|
||||||
with TaskGroup("retry_logic", tooltip="Retry logic with account management") as retry_logic_group:
|
with TaskGroup("retry_logic", tooltip="Retry logic with account management") as retry_logic_group:
|
||||||
@ -1650,7 +1933,7 @@ with DAG(
|
|||||||
direct_retry_account_task >> coalesced_retry_data
|
direct_retry_account_task >> coalesced_retry_data
|
||||||
coalesced_retry_data >> retry_token_task
|
coalesced_retry_data >> retry_token_task
|
||||||
retry_token_task >> retry_branch_task
|
retry_token_task >> retry_branch_task
|
||||||
retry_branch_task >> [fatal_error_task, report_failure_task, ban_after_retry_report_task, age_restriction_task, report_bannable_and_continue_task]
|
retry_branch_task >> [fatal_error_task, report_failure_task, ban_after_retry_report_task, unrecoverable_video_error_task, report_bannable_and_continue_task]
|
||||||
ban_after_retry_report_task >> report_failure_and_stop_task
|
ban_after_retry_report_task >> report_failure_and_stop_task
|
||||||
|
|
||||||
# --- Task Group 3: Download and Processing ---
|
# --- Task Group 3: Download and Processing ---
|
||||||
@ -1759,18 +2042,18 @@ with DAG(
|
|||||||
|
|
||||||
# --- DAG Dependencies between TaskGroups ---
|
# --- DAG Dependencies between TaskGroups ---
|
||||||
# Initial attempt can lead to retry logic or direct failure
|
# Initial attempt can lead to retry logic or direct failure
|
||||||
initial_branch_task >> [retry_logic_group, fatal_error_task, ban_and_report_immediately_task, age_restriction_task, report_bannable_and_continue_task]
|
initial_branch_task >> [retry_logic_group, fatal_error_task, ban_and_report_immediately_task, unrecoverable_video_error_task, report_bannable_and_continue_task]
|
||||||
|
|
||||||
# Ban and report immediately leads to failure reporting
|
# Ban and report immediately leads to failure reporting
|
||||||
ban_and_report_immediately_task >> report_failure_and_stop_task
|
ban_and_report_immediately_task >> report_failure_and_stop_task
|
||||||
|
|
||||||
# Age restriction error leads to failure reporting and continues the loop
|
# Unrecoverable/bannable errors that don't stop the loop should continue processing
|
||||||
age_restriction_task >> continue_loop_task
|
unrecoverable_video_error_task >> continue_loop_task
|
||||||
report_bannable_and_continue_task >> continue_loop_task
|
report_bannable_and_continue_task >> continue_loop_task
|
||||||
report_failure_task >> continue_loop_task
|
report_failure_task >> continue_loop_task
|
||||||
|
|
||||||
# Connect download failure branch to the new retry group
|
# Connect download failure branch to the new retry group
|
||||||
download_branch_task >> [retry_logic_for_download_group, report_failure_task, fatal_error_task]
|
download_branch_task >> [retry_logic_for_download_group, report_failure_task, fatal_error_task, unrecoverable_video_error_task]
|
||||||
|
|
||||||
# Connect success paths to the coalescing tasks
|
# Connect success paths to the coalescing tasks
|
||||||
download_task >> final_files
|
download_task >> final_files
|
||||||
|
|||||||
@ -18,7 +18,7 @@ from airflow.utils.dates import days_ago
|
|||||||
from airflow.api.common.trigger_dag import trigger_dag
|
from airflow.api.common.trigger_dag import trigger_dag
|
||||||
from airflow.models.dagrun import DagRun
|
from airflow.models.dagrun import DagRun
|
||||||
from airflow.models.dag import DagModel
|
from airflow.models.dag import DagModel
|
||||||
from datetime import timedelta
|
from datetime import timedelta, datetime
|
||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
@ -35,41 +35,6 @@ from thrift.transport import TSocket, TTransport
|
|||||||
# Configure logging
|
# Configure logging
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
DEFAULT_REQUEST_PARAMS_JSON = """{
|
|
||||||
"context_reuse_policy": {
|
|
||||||
"enabled": true,
|
|
||||||
"max_age_seconds": 86400,
|
|
||||||
"reuse_visitor_id": true,
|
|
||||||
"reuse_cookies": true
|
|
||||||
},
|
|
||||||
"token_generation_strategy": {
|
|
||||||
"youtubei_js": {
|
|
||||||
"generate_po_token": true,
|
|
||||||
"generate_gvs_token": true
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"ytdlp_params": {
|
|
||||||
"use_curl_prefetch": false,
|
|
||||||
"token_supplement_strategy": {
|
|
||||||
"youtubepot_bgutilhttp_extractor": {
|
|
||||||
"enabled": true
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"visitor_id_override": {
|
|
||||||
"enabled": true
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"session_params": {
|
|
||||||
"lang": "en-US",
|
|
||||||
"location": "US",
|
|
||||||
"deviceCategory": "MOBILE",
|
|
||||||
"user_agents": {
|
|
||||||
"youtubei_js": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)",
|
|
||||||
"yt_dlp": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}"""
|
|
||||||
|
|
||||||
# Default settings
|
# Default settings
|
||||||
DEFAULT_REDIS_CONN_ID = 'redis_default'
|
DEFAULT_REDIS_CONN_ID = 'redis_default'
|
||||||
DEFAULT_TOTAL_WORKERS = 8
|
DEFAULT_TOTAL_WORKERS = 8
|
||||||
@ -188,6 +153,17 @@ def orchestrate_workers_ignition_callable(**context):
|
|||||||
dag_run_id = context['dag_run'].run_id
|
dag_run_id = context['dag_run'].run_id
|
||||||
total_triggered = 0
|
total_triggered = 0
|
||||||
|
|
||||||
|
# --- Generate a consistent timestamped prefix for this orchestrator run ---
|
||||||
|
# This ensures all workers spawned from this run use the same set of accounts.
|
||||||
|
final_account_pool_prefix = params['account_pool']
|
||||||
|
if params.get('prepend_client_to_account') and params.get('account_pool_size') is not None:
|
||||||
|
clients_str = params.get('clients', '')
|
||||||
|
primary_client = clients_str.split(',')[0].strip() if clients_str else 'unknown'
|
||||||
|
# Use a timestamp from the orchestrator's run for consistency
|
||||||
|
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
|
||||||
|
final_account_pool_prefix = f"{params['account_pool']}_{timestamp}_{primary_client}"
|
||||||
|
logger.info(f"Generated consistent account prefix for this run: '{final_account_pool_prefix}'")
|
||||||
|
|
||||||
for i, bunch in enumerate(bunches):
|
for i, bunch in enumerate(bunches):
|
||||||
logger.info(f"--- Triggering Bunch {i+1}/{len(bunches)} (contains {len(bunch)} dispatcher(s)) ---")
|
logger.info(f"--- Triggering Bunch {i+1}/{len(bunches)} (contains {len(bunch)} dispatcher(s)) ---")
|
||||||
for j, _ in enumerate(bunch):
|
for j, _ in enumerate(bunch):
|
||||||
@ -196,6 +172,8 @@ def orchestrate_workers_ignition_callable(**context):
|
|||||||
|
|
||||||
# Pass all orchestrator params to the dispatcher, which will then pass them to the worker.
|
# Pass all orchestrator params to the dispatcher, which will then pass them to the worker.
|
||||||
conf_to_pass = {p: params[p] for p in params}
|
conf_to_pass = {p: params[p] for p in params}
|
||||||
|
# Override account_pool with the generated prefix
|
||||||
|
conf_to_pass['account_pool'] = final_account_pool_prefix
|
||||||
|
|
||||||
logger.info(f"Triggering dispatcher {j+1}/{len(bunch)} in bunch {i+1} (run {total_triggered + 1}/{total_workers}) (Run ID: {run_id})")
|
logger.info(f"Triggering dispatcher {j+1}/{len(bunch)} in bunch {i+1} (run {total_triggered + 1}/{total_workers}) (Run ID: {run_id})")
|
||||||
logger.debug(f"Full conf for dispatcher run {run_id}: {conf_to_pass}")
|
logger.debug(f"Full conf for dispatcher run {run_id}: {conf_to_pass}")
|
||||||
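Condensed sketch of the prefix hand-off introduced in this hunk: the orchestrator builds one timestamped, client-tagged prefix per run and overrides `account_pool` in the conf it passes to every dispatcher, so all workers spawned from that run share the same account names. The guard on `prepend_client_to_account` / `account_pool_size` shown in the diff is omitted here.

```python
from datetime import datetime

def build_account_pool_prefix(account_pool: str, clients: str) -> str:
    # Mirrors the orchestrator change above: one timestamp per orchestrator run.
    primary_client = clients.split(',')[0].strip() if clients else 'unknown'
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    return f"{account_pool}_{timestamp}_{primary_client}"

# e.g. build_account_pool_prefix('pool', 'tv_simply,mweb') -> 'pool_20250101120000_tv_simply';
# the orchestrator then sets conf_to_pass['account_pool'] to this value.
```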
@ -294,17 +272,12 @@ with DAG(
|
|||||||
"'proceed_loop_under_manual_inspection': **BEWARE: MANUAL SUPERVISION REQUIRED.** Marks the URL as failed but continues the processing loop. Use this only when you can manually intervene by pausing the dispatcher DAG or creating a lock file (`/opt/airflow/inputfiles/AIRFLOW.PREVENT_URL_PULL.lockfile`) to prevent a runaway failure loop."
|
"'proceed_loop_under_manual_inspection': **BEWARE: MANUAL SUPERVISION REQUIRED.** Marks the URL as failed but continues the processing loop. Use this only when you can manually intervene by pausing the dispatcher DAG or creating a lock file (`/opt/airflow/inputfiles/AIRFLOW.PREVENT_URL_PULL.lockfile`) to prevent a runaway failure loop."
|
||||||
"'stop_loop_on_auth_proceed_on_download_error': **(Default)** Stops the loop on an authentication/token error (like 'stop_loop'), but continues the loop on a download/probe error (like 'proceed...')."
|
"'stop_loop_on_auth_proceed_on_download_error': **(Default)** Stops the loop on an authentication/token error (like 'stop_loop'), but continues the loop on a download/probe error (like 'proceed...')."
|
||||||
),
|
),
|
||||||
'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."),
|
'request_params_json': Param('{}', type="string", title="[Worker Param] Request Params JSON", description="JSON string with per-request parameters to override server defaults. Can be a full JSON object or comma-separated key=value pairs (e.g., 'session_params.location=DE,ytdlp_params.skip_cache=true')."),
|
||||||
|
'language_code': Param('en-US', type="string", title="[Worker Param] Language Code", description="The language code (e.g., 'en-US', 'de-DE') to use for the YouTube request headers."),
|
||||||
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."),
|
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."),
|
||||||
'clients': Param(
|
'clients': Param(
|
||||||
'tv_simply',
|
'tv_simply',
|
||||||
type="string",
|
type="string",
|
||||||
enum=[
|
|
||||||
'tv_simply',
|
|
||||||
'mweb',
|
|
||||||
'tv',
|
|
||||||
'custom',
|
|
||||||
],
|
|
||||||
title="[Worker Param] Clients",
|
title="[Worker Param] Clients",
|
||||||
description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_embedded. See DAG documentation for details."
|
description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_embedded. See DAG documentation for details."
|
||||||
),
|
),
|
||||||
|
|||||||
@ -249,27 +249,20 @@ with DAG(
|
|||||||
'delay_between_formats_s': Param(15, type="integer", title="[Worker Param] Delay Between Formats (s)", description="Delay in seconds between downloading each format when multiple formats are specified. A 22s wait may be effective for batch downloads, while 6-12s may suffice if cookies are refreshed regularly."),
|
'delay_between_formats_s': Param(15, type="integer", title="[Worker Param] Delay Between Formats (s)", description="Delay in seconds between downloading each format when multiple formats are specified. A 22s wait may be effective for batch downloads, while 6-12s may suffice if cookies are refreshed regularly."),
|
||||||
'yt_dlp_test_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Test Mode", description="If True, runs yt-dlp with --test flag (dry run without downloading)."),
|
'yt_dlp_test_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Test Mode", description="If True, runs yt-dlp with --test flag (dry run without downloading)."),
|
||||||
'skip_probe': Param(True, type="boolean", title="[Worker Param] Skip Probe", description="If True, skips the ffmpeg probe of downloaded files."),
|
'skip_probe': Param(True, type="boolean", title="[Worker Param] Skip Probe", description="If True, skips the ffmpeg probe of downloaded files."),
|
||||||
'yt_dlp_cleanup_mode': Param(True, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
|
'yt_dlp_cleanup_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
|
||||||
'fragment_retries': Param(2, type="integer", title="[Worker Param] Fragment Retries", description="Number of retries for a fragment before giving up."),
|
'fragment_retries': Param(2, type="integer", title="[Worker Param] Fragment Retries", description="Number of retries for a fragment before giving up."),
|
||||||
'limit_rate': Param('5M', type=["string", "null"], title="[Worker Param] Limit Rate", description="Download speed limit (e.g., 50K, 4.2M)."),
|
'limit_rate': Param('5M', type=["string", "null"], title="[Worker Param] Limit Rate", description="Download speed limit (e.g., 50K, 4.2M)."),
|
||||||
'socket_timeout': Param(15, type="integer", title="[Worker Param] Socket Timeout", description="Timeout in seconds for socket operations."),
|
'socket_timeout': Param(15, type="integer", title="[Worker Param] Socket Timeout", description="Timeout in seconds for socket operations."),
|
||||||
'min_sleep_interval': Param(5, type="integer", title="[Worker Param] Min Sleep Interval", description="Minimum time to sleep between downloads (seconds)."),
|
'min_sleep_interval': Param(5, type="integer", title="[Worker Param] Min Sleep Interval", description="Minimum time to sleep between downloads (seconds)."),
|
||||||
'max_sleep_interval': Param(10, type="integer", title="[Worker Param] Max Sleep Interval", description="Maximum time to sleep between downloads (seconds)."),
|
'max_sleep_interval': Param(10, type="integer", title="[Worker Param] Max Sleep Interval", description="Maximum time to sleep between downloads (seconds)."),
|
||||||
'download_format_preset': Param(
|
'download_format': Param(
|
||||||
'formats_2',
|
'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
|
||||||
type="string",
|
type="string",
|
||||||
enum=['best_audio', 'formats_0', 'formats_2', 'formats_3', 'custom'],
|
title="[Worker Param] Download Format",
|
||||||
title="[Worker Param] Download Format Preset",
|
description="Custom yt-dlp format string. Common presets: [1] 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best' (Default, best quality MP4). [2] '18-dashy/18,140-dashy/140,133-dashy/134-dashy/136-dashy/137-dashy/250-dashy/298-dashy/299-dashy' (Legacy formats). [3] '299-dashy/298-dashy/250-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy' (High-framerate formats)."
|
||||||
description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformats_0: 18,140\nformats_2: 18,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318"
|
|
||||||
),
|
|
||||||
'download_format_custom': Param(
|
|
||||||
'18,140-dashy,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy',
|
|
||||||
type="string",
|
|
||||||
title="[Worker Param] Custom Download Format",
|
|
||||||
description="Custom yt-dlp format string. Used when preset is 'custom'. E.g., 'ba[ext=m4a]/bestaudio/best'."
|
|
||||||
),
|
),
|
||||||
'downloader': Param(
|
'downloader': Param(
|
||||||
'py',
|
'cli',
|
||||||
type="string",
|
type="string",
|
||||||
enum=['py', 'aria-rpc', 'cli'],
|
enum=['py', 'aria-rpc', 'cli'],
|
||||||
title="[Worker Param] Download Tool",
|
title="[Worker Param] Download Tool",
|
||||||
@ -279,7 +272,7 @@ with DAG(
|
|||||||
'aria_port': Param(6800, type="integer", title="[Worker Param] Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server. Can be set via Airflow Variable 'YTDLP_ARIA_PORT'."),
|
'aria_port': Param(6800, type="integer", title="[Worker Param] Aria2c Port", description="For 'aria-rpc' downloader: Port of the aria2c RPC server. Can be set via Airflow Variable 'YTDLP_ARIA_PORT'."),
|
||||||
'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="[Worker Param] Aria2c Secret", description="For 'aria-rpc' downloader: Secret token. Can be set via Airflow Variable 'YTDLP_ARIA_SECRET'."),
|
'aria_secret': Param('SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX', type="string", title="[Worker Param] Aria2c Secret", description="For 'aria-rpc' downloader: Secret token. Can be set via Airflow Variable 'YTDLP_ARIA_SECRET'."),
|
||||||
'yt_dlp_extra_args': Param(
|
'yt_dlp_extra_args': Param(
|
||||||
'--restrict-filenames',
|
'--no-part --restrict-filenames',
|
||||||
type=["string", "null"],
|
type=["string", "null"],
|
||||||
title="[Worker Param] Extra yt-dlp arguments",
|
title="[Worker Param] Extra yt-dlp arguments",
|
||||||
description="Extra command-line arguments for yt-dlp during download."
|
description="Extra command-line arguments for yt-dlp during download."
|
||||||
|
|||||||
@ -17,14 +17,14 @@ from __future__ import annotations
|
|||||||
from airflow.decorators import task, task_group
|
from airflow.decorators import task, task_group
|
||||||
from airflow.exceptions import AirflowException, AirflowSkipException
|
from airflow.exceptions import AirflowException, AirflowSkipException
|
||||||
from airflow.models import Variable
|
from airflow.models import Variable
|
||||||
from airflow.models.dag import DAG
|
from airflow.models.dag import DAG, DagModel
|
||||||
from airflow.models.param import Param
|
from airflow.models.param import Param
|
||||||
from airflow.models.xcom_arg import XComArg
|
from airflow.models.xcom_arg import XComArg
|
||||||
from airflow.operators.dummy import DummyOperator
|
from airflow.operators.dummy import DummyOperator
|
||||||
from airflow.utils.dates import days_ago
|
from airflow.utils.dates import days_ago
|
||||||
from airflow.utils.task_group import TaskGroup
|
from airflow.utils.task_group import TaskGroup
|
||||||
from airflow.api.common.trigger_dag import trigger_dag
|
from airflow.api.common.trigger_dag import trigger_dag
|
||||||
from copy import copy
|
import copy
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import json
|
import json
|
||||||
@ -143,10 +143,12 @@ DEFAULT_REQUEST_PARAMS = {
|
|||||||
"session_params": {
|
"session_params": {
|
||||||
"lang": "en-US",
|
"lang": "en-US",
|
||||||
"location": "US",
|
"location": "US",
|
||||||
"deviceCategory": "MOBILE",
|
"deviceCategory": "TV",
|
||||||
"user_agents": {
|
"user_agents": {
|
||||||
"youtubei_js": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)",
|
# "youtubei_js": "Mozilla/5.0 (Linux; Cobalt) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
|
||||||
"yt_dlp": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)"
|
"youtubei_js": "Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version",
|
||||||
|
# "yt_dlp": "Mozilla/5.0 (Linux; Cobalt) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
|
||||||
|
"yt_dlp": "Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -208,14 +210,9 @@ def _get_account_pool(params: dict) -> list:
|
|||||||
is_prefix_mode = True
|
is_prefix_mode = True
|
||||||
pool_size = int(pool_size_param)
|
pool_size = int(pool_size_param)
|
||||||
|
|
||||||
if params.get('prepend_client_to_account', True):
|
# The orchestrator now generates the full prefix if prepend_client_to_account is True.
|
||||||
clients_str = params.get('clients', '')
|
# The worker just appends the numbers.
|
||||||
primary_client = clients_str.split(',')[0].strip() if clients_str else 'unknown'
|
accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
|
||||||
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
|
|
||||||
new_prefix = f"{prefix}_{timestamp}_{primary_client}"
|
|
||||||
accounts = [f"{new_prefix}_{i:02d}" for i in range(1, pool_size + 1)]
|
|
||||||
else:
|
|
||||||
accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
|
|
||||||
else:
|
else:
|
||||||
accounts = [prefix]
|
accounts = [prefix]
|
||||||
|
|
||||||
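Worker counterpart of the orchestrator change: `_get_account_pool` no longer re-derives a timestamped prefix and simply appends zero-padded indices to whatever prefix it receives. A sketch:

```python
def expand_account_pool(prefix: str, pool_size: int) -> list:
    # The prefix arrives fully formed from the orchestrator; the worker only numbers it.
    return [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]

# expand_account_pool('pool_20250101120000_tv_simply', 3)
# -> ['pool_20250101120000_tv_simply_01', '..._02', '..._03']
```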
@ -347,12 +344,26 @@ def get_url_and_assign_account(**context):
|
|||||||
# For manual runs, we fall back to 'manual_url_to_process'.
|
# For manual runs, we fall back to 'manual_url_to_process'.
|
||||||
url_to_process = params.get('url_to_process')
|
url_to_process = params.get('url_to_process')
|
||||||
if not url_to_process:
|
if not url_to_process:
|
||||||
url_to_process = params.get('manual_url_to_process')
|
manual_url_input = params.get('manual_url_to_process')
|
||||||
if url_to_process:
|
if manual_url_input:
|
||||||
logger.info(f"Using URL from manual run parameter: '{url_to_process}'")
|
logger.info(f"Using URL from manual run parameter: '{manual_url_input}'")
|
||||||
|
if manual_url_input == 'PULL_FROM_QUEUE':
|
||||||
|
logger.info("Manual run is set to pull from queue.")
|
||||||
|
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
|
||||||
|
queue_name = params.get('queue_name', DEFAULT_QUEUE_NAME)
|
||||||
|
inbox_queue = f"{queue_name}_inbox"
|
||||||
|
client = _get_redis_client(redis_conn_id)
|
||||||
|
url_bytes = client.lpop(inbox_queue)
|
||||||
|
if not url_bytes:
|
||||||
|
logger.info("Redis queue is empty. No work to do. Skipping task.")
|
||||||
|
raise AirflowSkipException("Redis queue is empty. No work to do.")
|
||||||
|
url_to_process = url_bytes.decode('utf-8')
|
||||||
|
logger.info(f"Pulled URL '{url_to_process}' from queue '{inbox_queue}'.")
|
||||||
|
else:
|
||||||
|
url_to_process = manual_url_input
|
||||||
|
|
||||||
if not url_to_process:
|
if not url_to_process:
|
||||||
raise AirflowException("No URL to process. For manual runs, please provide a URL in the 'manual_url_to_process' parameter.")
|
raise AirflowException("No URL to process. For manual runs, please provide a URL in the 'manual_url_to_process' parameter, or 'PULL_FROM_QUEUE'.")
|
||||||
logger.info(f"Received URL '{url_to_process}' to process.")
|
logger.info(f"Received URL '{url_to_process}' to process.")
|
||||||
|
|
||||||
# Mark the URL as in-progress in Redis
|
# Mark the URL as in-progress in Redis
|
||||||
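Sketch of the new 'PULL_FROM_QUEUE' manual-run path above, isolated from the task body. `_get_redis_client` is the DAG's own helper; the default queue name below is a placeholder.

```python
from airflow.exceptions import AirflowSkipException

def pull_manual_url(params: dict) -> str:
    client = _get_redis_client(params.get('redis_conn_id', 'redis_default'))
    inbox_queue = f"{params.get('queue_name', 'video_queue')}_inbox"  # placeholder default
    url_bytes = client.lpop(inbox_queue)
    if not url_bytes:
        raise AirflowSkipException("Redis queue is empty. No work to do.")
    return url_bytes.decode('utf-8')
```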
@ -399,9 +410,29 @@ def get_token(initial_data: dict, **context):
|
|||||||
host, port = params['service_ip'], int(params['service_port'])
|
host, port = params['service_ip'], int(params['service_port'])
|
||||||
machine_id = params.get('machine_id') or socket.gethostname()
|
machine_id = params.get('machine_id') or socket.gethostname()
|
||||||
clients = params.get('clients')
|
clients = params.get('clients')
|
||||||
request_params_json = params.get('request_params_json', '{}')
|
request_params_json = params.get('request_params_json')
|
||||||
|
language_code = params.get('language_code')
|
||||||
assigned_proxy_url = params.get('assigned_proxy_url')
|
assigned_proxy_url = params.get('assigned_proxy_url')
|
||||||
|
|
||||||
|
if language_code:
|
||||||
|
try:
|
||||||
|
params_dict = json.loads(request_params_json)
|
||||||
|
if not params_dict:
|
||||||
|
params_dict = copy.deepcopy(DEFAULT_REQUEST_PARAMS)
|
||||||
|
|
||||||
|
logger.info(f"Setting language for request: {language_code}")
|
||||||
|
if 'session_params' not in params_dict:
|
||||||
|
params_dict['session_params'] = {}
|
||||||
|
params_dict['session_params']['lang'] = language_code
|
||||||
|
request_params_json = json.dumps(params_dict)
|
||||||
|
except (json.JSONDecodeError, TypeError):
|
||||||
|
logger.warning("Could not parse request_params_json as JSON. Treating as key=value pairs and appending language code.")
|
||||||
|
lang_kv = f"session_params.lang={language_code}"
|
||||||
|
if request_params_json:
|
||||||
|
request_params_json += f",{lang_kv}"
|
||||||
|
else:
|
||||||
|
request_params_json = lang_kv
|
||||||
|
|
||||||
video_id = _extract_video_id(url)
|
video_id = _extract_video_id(url)
|
||||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
job_dir_name = f"{timestamp}-{video_id or 'unknown'}"
|
job_dir_name = f"{timestamp}-{video_id or 'unknown'}"
|
||||||
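The language handling added above, as one self-contained sketch: merge `language_code` into the JSON request params when they parse, otherwise append it in the key=value syntax the token service also accepts.

```python
import copy
import json

def apply_language(request_params_json: str, language_code: str, defaults: dict) -> str:
    try:
        params_dict = json.loads(request_params_json) or copy.deepcopy(defaults)
        params_dict.setdefault('session_params', {})['lang'] = language_code
        return json.dumps(params_dict)
    except (json.JSONDecodeError, TypeError):
        # Not JSON: fall back to comma-separated key=value pairs.
        kv = f"session_params.lang={language_code}"
        return f"{request_params_json},{kv}" if request_params_json else kv
```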
@ -445,18 +476,39 @@ def get_token(initial_data: dict, **context):
|
|||||||
|
|
||||||
if process.returncode != 0:
|
if process.returncode != 0:
|
||||||
error_message = "ytops-client failed. See logs for details."
|
error_message = "ytops-client failed. See logs for details."
|
||||||
for line in reversed(process.stderr.strip().split('\n')):
|
# Try to find a more specific error message from the Thrift client's output
|
||||||
if 'ERROR' in line or 'Thrift error' in line or 'Connection to server failed' in line:
|
thrift_error_match = re.search(r'A Thrift error occurred: (.*)', process.stderr)
|
||||||
error_message = line.strip()
|
if thrift_error_match:
|
||||||
break
|
error_message = thrift_error_match.group(1).strip()
|
||||||
|
else: # Fallback to old line-by-line parsing
|
||||||
|
for line in reversed(process.stderr.strip().split('\n')):
|
||||||
|
if 'ERROR' in line or 'Thrift error' in line or 'Connection to server failed' in line:
|
||||||
|
error_message = line.strip()
|
||||||
|
break
|
||||||
|
|
||||||
|
# Determine error code for branching logic
|
||||||
error_code = 'GET_INFO_CLIENT_FAIL'
|
error_code = 'GET_INFO_CLIENT_FAIL'
|
||||||
if "BOT_DETECTED" in process.stderr:
|
stderr_lower = process.stderr.lower()
|
||||||
error_code = "BOT_DETECTED"
|
|
||||||
elif "BOT_DETECTION_SIGN_IN_REQUIRED" in process.stderr:
|
# These patterns should match the error codes from PBUserException and others
|
||||||
error_code = "BOT_DETECTION_SIGN_IN_REQUIRED"
|
error_patterns = {
|
||||||
elif "Connection to server failed" in process.stderr:
|
"BOT_DETECTED": ["bot_detected"],
|
||||||
error_code = "TRANSPORT_ERROR"
|
"BOT_DETECTION_SIGN_IN_REQUIRED": ["bot_detection_sign_in_required"],
|
||||||
|
"TRANSPORT_ERROR": ["connection to server failed"],
|
||||||
|
"PRIVATE_VIDEO": ["private video"],
|
||||||
|
"COPYRIGHT_REMOVAL": ["copyright"],
|
||||||
|
"GEO_RESTRICTED": ["in your country"],
|
||||||
|
"VIDEO_REMOVED": ["video has been removed"],
|
||||||
|
"VIDEO_UNAVAILABLE": ["video unavailable"],
|
||||||
|
"MEMBERS_ONLY": ["members-only"],
|
||||||
|
"AGE_GATED_SIGN_IN": ["sign in to confirm your age"],
|
||||||
|
"VIDEO_PROCESSING": ["processing this video"],
|
||||||
|
}
|
||||||
|
|
||||||
|
for code, patterns in error_patterns.items():
|
||||||
|
if any(p in stderr_lower for p in patterns):
|
||||||
|
error_code = code
|
||||||
|
break # Found a match, stop searching
|
||||||
|
|
||||||
error_details = {
|
error_details = {
|
||||||
'error_message': error_message,
|
'error_message': error_message,
|
||||||
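The stderr classification introduced above, reduced to its core: the first pattern found in the lowercased stderr wins, otherwise the generic client-failure code is kept. Only part of the pattern table is repeated here.

```python
ERROR_PATTERNS = {
    "BOT_DETECTED": ["bot_detected"],
    "TRANSPORT_ERROR": ["connection to server failed"],
    "PRIVATE_VIDEO": ["private video"],
    "AGE_GATED_SIGN_IN": ["sign in to confirm your age"],
    # ... remaining codes as listed in the hunk above
}

def classify_stderr(stderr: str, default: str = 'GET_INFO_CLIENT_FAIL') -> str:
    stderr_lower = stderr.lower()
    for code, patterns in ERROR_PATTERNS.items():
        if any(p in stderr_lower for p in patterns):
            return code
    return default
```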
@ -471,8 +523,23 @@ def get_token(initial_data: dict, **context):
|
|||||||
if proxy_match:
|
if proxy_match:
|
||||||
proxy = proxy_match.group(1).strip()
|
proxy = proxy_match.group(1).strip()
|
||||||
|
|
||||||
|
# Rename the info.json to include the proxy for the download worker
|
||||||
|
final_info_json_path = info_json_path
|
||||||
|
if proxy:
|
||||||
|
# Sanitize for filename: replace '://' which is invalid in paths. Colons are usually fine.
|
||||||
|
sanitized_proxy = proxy.replace('://', '---')
|
||||||
|
|
||||||
|
new_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}_proxy_{sanitized_proxy}.json"
|
||||||
|
new_path = os.path.join(job_dir_path, new_filename)
|
||||||
|
try:
|
||||||
|
os.rename(info_json_path, new_path)
|
||||||
|
final_info_json_path = new_path
|
||||||
|
logger.info(f"Renamed info.json to include proxy: {new_path}")
|
||||||
|
except OSError as e:
|
||||||
|
logger.error(f"Failed to rename info.json to include proxy: {e}. Using original path.")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'info_json_path': info_json_path,
|
'info_json_path': final_info_json_path,
|
||||||
'job_dir_path': job_dir_path,
|
'job_dir_path': job_dir_path,
|
||||||
'socks_proxy': proxy,
|
'socks_proxy': proxy,
|
||||||
'ytdlp_command': None,
|
'ytdlp_command': None,
|
||||||
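Sketch of the info.json rename above, presumably so the download worker can recover the proxy from the filename; only '://' is replaced for path safety, and the original path is kept if the rename fails.

```python
import os

def rename_with_proxy(info_json_path: str, job_dir: str, video_id: str,
                      account_id: str, timestamp: str, proxy: str) -> str:
    if not proxy:
        return info_json_path
    sanitized = proxy.replace('://', '---')
    new_path = os.path.join(
        job_dir, f"info_{video_id}_{account_id}_{timestamp}_proxy_{sanitized}.json")
    try:
        os.rename(info_json_path, new_path)
        return new_path
    except OSError:
        return info_json_path  # fall back to the original path, as the task does
```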
@ -498,10 +565,15 @@ def handle_bannable_error_branch(task_id_to_check: str, **context):
|
|||||||
error_code = error_details.get('error_code', '').strip()
|
error_code = error_details.get('error_code', '').strip()
|
||||||
policy = params.get('on_bannable_failure', 'retry_with_new_account')
|
policy = params.get('on_bannable_failure', 'retry_with_new_account')
|
||||||
|
|
||||||
# Check if this is an age confirmation error - should not stop the loop
|
# Unrecoverable video errors that should not be retried or treated as system failures.
|
||||||
if "Sign in to confirm your age" in error_message or "confirm your age" in error_message.lower():
|
unrecoverable_video_errors = [
|
||||||
logger.info(f"Age confirmation error detected for '{task_id_to_check}'. This is a content restriction, not a bot detection issue.")
|
"AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "COPYRIGHT_REMOVAL",
|
||||||
return 'handle_age_restriction_error'
|
"GEO_RESTRICTED", "VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED"
|
||||||
|
]
|
||||||
|
|
||||||
|
if error_code in unrecoverable_video_errors:
|
||||||
|
logger.warning(f"Unrecoverable video error '{error_code}' detected for '{task_id_to_check}'. This is a content issue, not a system failure.")
|
||||||
|
return 'handle_unrecoverable_video_error'
|
||||||
|
|
||||||
# Fatal Thrift connection errors that should stop all processing.
|
# Fatal Thrift connection errors that should stop all processing.
|
||||||
if error_code == 'TRANSPORT_ERROR':
|
if error_code == 'TRANSPORT_ERROR':
|
||||||
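The branch decision above in miniature: content-level failures short-circuit to the new skip handler before any ban/retry policy is consulted. The non-content branches below are simplified and their task ids partly assumed; the real callable also honors `on_bannable_failure`.

```python
UNRECOVERABLE_VIDEO_ERRORS = {
    "AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "COPYRIGHT_REMOVAL",
    "GEO_RESTRICTED", "VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED",
}

def choose_error_branch(error_code: str) -> str:
    if error_code in UNRECOVERABLE_VIDEO_ERRORS:
        return 'handle_unrecoverable_video_error'
    if error_code == 'TRANSPORT_ERROR':
        return 'handle_fatal_error'          # assumed task id for the fatal path
    return 'report_bannable_and_continue'    # simplified; policy handling omitted
```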
@ -718,6 +790,59 @@ def push_auth_success_to_redis(initial_data: dict, token_data: dict, **context):
|
|||||||
logger.info(f"Pushed successful auth data for URL '{url}' to '{dl_inbox_queue}'.")
|
logger.info(f"Pushed successful auth data for URL '{url}' to '{dl_inbox_queue}'.")
|
||||||
logger.info(f"Stored success result for auth on URL '{url}' in '{auth_result_queue}'.")
|
logger.info(f"Stored success result for auth on URL '{url}' in '{auth_result_queue}'.")
|
||||||
|
|
||||||
|
@task
|
||||||
|
def handle_unrecoverable_video_error(**context):
|
||||||
|
"""
|
||||||
|
Handles errors for videos that are unavailable (private, removed, etc.).
|
||||||
|
These are not system failures, so the URL is logged to a 'skipped' queue
|
||||||
|
and the processing loop continues without marking the run as failed.
|
||||||
|
"""
|
||||||
|
params = context['params']
|
||||||
|
ti = context['task_instance']
|
||||||
|
url = params.get('url_to_process', 'unknown')
|
||||||
|
|
||||||
|
# Collect error details from the failed get_token task
|
||||||
|
error_details = {}
|
||||||
|
first_token_error = ti.xcom_pull(task_ids='initial_attempt.get_token', key='error_details')
|
||||||
|
retry_token_error = ti.xcom_pull(task_ids='retry_logic.retry_get_token', key='error_details')
|
||||||
|
|
||||||
|
if retry_token_error:
|
||||||
|
error_details = retry_token_error
|
||||||
|
elif first_token_error:
|
||||||
|
error_details = first_token_error
|
||||||
|
|
||||||
|
error_code = error_details.get('error_code', 'UNKNOWN_VIDEO_ERROR')
|
||||||
|
error_message = error_details.get('error_message', 'Video is unavailable for an unknown reason.')
|
||||||
|
|
||||||
|
logger.warning(f"Skipping URL '{url}' due to unrecoverable video error: {error_code} - {error_message}")
|
||||||
|
|
||||||
|
result_data = {
|
||||||
|
'status': 'skipped',
|
||||||
|
'end_time': time.time(),
|
||||||
|
'url': url,
|
||||||
|
'dag_run_id': context['dag_run'].run_id,
|
||||||
|
'reason': error_code,
|
||||||
|
'details': error_message,
|
||||||
|
'error_details': error_details
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
client = _get_redis_client(params['redis_conn_id'])
|
||||||
|
|
||||||
|
# New queue for skipped videos
|
||||||
|
skipped_queue = f"{params['queue_name']}_skipped"
|
||||||
|
progress_queue = f"{params['queue_name']}_progress"
|
||||||
|
|
||||||
|
with client.pipeline() as pipe:
|
||||||
|
pipe.hset(skipped_queue, url, json.dumps(result_data))
|
||||||
|
pipe.hdel(progress_queue, url)
|
||||||
|
pipe.execute()
|
||||||
|
|
||||||
|
logger.info(f"Stored skipped result for URL '{url}' in '{skipped_queue}' and removed from progress queue.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not report skipped video to Redis: {e}", exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
@task(trigger_rule='one_failed')
|
@task(trigger_rule='one_failed')
|
||||||
def report_failure_and_continue(**context):
|
def report_failure_and_continue(**context):
|
||||||
"""
|
"""
|
||||||
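A hypothetical way to inspect the new skipped-videos hash from outside Airflow; the connection details and queue name below are assumptions for your deployment (the DAG derives the name as `{queue_name}_skipped`).

```python
import json
import redis

r = redis.Redis(host='localhost', port=6379)   # assumption: local Redis
skipped_queue = 'video_queue_skipped'          # assumption: your queue_name + '_skipped'
for url, payload in r.hgetall(skipped_queue).items():
    data = json.loads(payload)
    print(url.decode(), data['reason'], data['details'])
```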
@ -732,8 +857,8 @@ def report_failure_and_continue(**context):
|
|||||||
error_details = {}
|
error_details = {}
|
||||||
|
|
||||||
# Check for error details from get_token tasks
|
# Check for error details from get_token tasks
|
||||||
first_token_task_id = 'get_token'
|
first_token_task_id = 'initial_attempt.get_token'
|
||||||
retry_token_task_id = 'retry_get_token'
|
retry_token_task_id = 'retry_logic.retry_get_token'
|
||||||
|
|
||||||
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
|
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
|
||||||
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
|
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
|
||||||
@ -798,8 +923,8 @@ def handle_fatal_error(**context):
|
|||||||
|
|
||||||
# Collect error details
|
# Collect error details
|
||||||
error_details = {}
|
error_details = {}
|
||||||
first_token_task_id = 'get_token'
|
first_token_task_id = 'initial_attempt.get_token'
|
||||||
retry_token_task_id = 'retry_get_token'
|
retry_token_task_id = 'retry_logic.retry_get_token'
|
||||||
|
|
||||||
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
|
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
|
||||||
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
|
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
|
||||||
@ -866,6 +991,12 @@ def continue_processing_loop(**context):
|
|||||||
logger.info(f"DAG run '{dag_run.run_id}' does not appear to be triggered by the dispatcher. Stopping processing loop.")
|
logger.info(f"DAG run '{dag_run.run_id}' does not appear to be triggered by the dispatcher. Stopping processing loop.")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
dispatcher_dag_id = 'ytdlp_ops_v02_dispatcher_auth'
|
||||||
|
dag_model = DagModel.get_dagmodel(dispatcher_dag_id)
|
||||||
|
if dag_model and dag_model.is_paused:
|
||||||
|
logger.warning(f"Dispatcher DAG '{dispatcher_dag_id}' is paused. Stopping processing loop.")
|
||||||
|
return
|
||||||
|
|
||||||
# Create a new unique run_id for the dispatcher.
|
# Create a new unique run_id for the dispatcher.
|
||||||
# Using a timestamp and UUID ensures the ID is unique and does not grow in length over time,
|
# Using a timestamp and UUID ensures the ID is unique and does not grow in length over time,
|
||||||
# preventing database errors.
|
# preventing database errors.
|
||||||
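The loop guard added above, as a standalone check: the worker refuses to re-trigger the dispatcher if it has been paused in the UI, which gives operators a clean way to stop the processing loop.

```python
from airflow.models.dag import DagModel

def dispatcher_is_paused(dag_id: str = 'ytdlp_ops_v02_dispatcher_auth') -> bool:
    dag_model = DagModel.get_dagmodel(dag_id)
    return bool(dag_model and dag_model.is_paused)
```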
@ -880,7 +1011,7 @@ def continue_processing_loop(**context):
|
|||||||
|
|
||||||
logger.info(f"Worker finished successfully. Triggering a new dispatcher ('{new_dispatcher_run_id}') to continue the loop.")
|
logger.info(f"Worker finished successfully. Triggering a new dispatcher ('{new_dispatcher_run_id}') to continue the loop.")
|
||||||
trigger_dag(
|
trigger_dag(
|
||||||
dag_id='ytdlp_ops_v02_dispatcher_auth',
|
dag_id=dispatcher_dag_id,
|
||||||
run_id=new_dispatcher_run_id,
|
run_id=new_dispatcher_run_id,
|
||||||
conf=conf_to_pass,
|
conf=conf_to_pass,
|
||||||
replace_microseconds=False
|
replace_microseconds=False
|
||||||
@ -902,10 +1033,15 @@ def handle_retry_failure_branch(task_id_to_check: str, **context):
|
|||||||
error_message = error_details.get('error_message', '').strip()
|
error_message = error_details.get('error_message', '').strip()
|
||||||
error_code = error_details.get('error_code', '').strip()
|
error_code = error_details.get('error_code', '').strip()
|
||||||
|
|
||||||
# Check if this is an age confirmation error - should not stop the loop
|
# Unrecoverable video errors that should not be retried or treated as system failures.
|
||||||
if "Sign in to confirm your age" in error_message or "confirm your age" in error_message.lower():
|
unrecoverable_video_errors = [
|
||||||
logger.info(f"Age confirmation error detected on retry from '{task_id_to_check}'. This is a content restriction, not a bot detection issue.")
|
"AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "COPYRIGHT_REMOVAL",
|
||||||
return 'handle_age_restriction_error'
|
"GEO_RESTRICTED", "VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED"
|
||||||
|
]
|
||||||
|
|
||||||
|
if error_code in unrecoverable_video_errors:
|
||||||
|
logger.warning(f"Unrecoverable video error '{error_code}' detected on retry for '{task_id_to_check}'.")
|
||||||
|
return 'handle_unrecoverable_video_error'
|
||||||
|
|
||||||
if error_code == 'TRANSPORT_ERROR':
|
if error_code == 'TRANSPORT_ERROR':
|
||||||
logger.error(f"Fatal Thrift connection error on retry from '{task_id_to_check}'.")
|
logger.error(f"Fatal Thrift connection error on retry from '{task_id_to_check}'.")
|
||||||
@ -964,8 +1100,8 @@ def report_bannable_and_continue(**context):
|
|||||||
|
|
||||||
# Collect error details
|
# Collect error details
|
||||||
error_details = {}
|
error_details = {}
|
||||||
first_token_task_id = 'get_token'
|
first_token_task_id = 'initial_attempt.get_token'
|
||||||
retry_token_task_id = 'retry_get_token'
|
retry_token_task_id = 'retry_logic.retry_get_token'
|
||||||
|
|
||||||
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
|
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
|
||||||
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
|
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
|
||||||
@ -1014,71 +1150,6 @@ def report_bannable_and_continue(**context):
|
|||||||
logger.error(f"Could not report bannable error to Redis: {e}", exc_info=True)
|
logger.error(f"Could not report bannable error to Redis: {e}", exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
@task
|
|
||||||
def handle_age_restriction_error(**context):
|
|
||||||
"""
|
|
||||||
Handles age restriction errors specifically. These are content restrictions
|
|
||||||
that cannot be bypassed by using different accounts, so we report the failure
|
|
||||||
and continue the processing loop rather than stopping it.
|
|
||||||
"""
|
|
||||||
params = context['params']
|
|
||||||
ti = context['task_instance']
|
|
||||||
url = params.get('url_to_process', 'unknown')
|
|
||||||
|
|
||||||
# Collect error details
|
|
||||||
error_details = {}
|
|
||||||
first_token_task_id = 'get_token'
|
|
||||||
retry_token_task_id = 'retry_get_token'
|
|
||||||
|
|
||||||
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
|
|
||||||
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
|
|
||||||
|
|
||||||
# Use the most recent error details
|
|
||||||
if retry_token_error:
|
|
||||||
error_details = retry_token_error
|
|
||||||
elif first_token_error:
|
|
||||||
error_details = first_token_error
|
|
||||||
|
|
||||||
logger.error(f"Age restriction error for URL '{url}'. This content requires age confirmation and cannot be bypassed.")
|
|
||||||
|
|
||||||
# Report failure to Redis so the URL can be marked as failed
|
|
||||||
try:
|
|
||||||
client = _get_redis_client(params['redis_conn_id'])
|
|
||||||
|
|
||||||
# Update client-specific stats
|
|
||||||
try:
|
|
||||||
machine_id = params.get('machine_id') or socket.gethostname()
|
|
||||||
_update_client_stats(client, params.get('clients', ''), 'failure', url, machine_id, context['dag_run'].run_id)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Could not update client stats on age restriction error: {e}", exc_info=True)
|
|
||||||
|
|
||||||
result_data = {
|
|
||||||
'status': 'failed',
|
|
||||||
'end_time': time.time(),
|
|
||||||
'url': url,
|
|
||||||
'dag_run_id': context['dag_run'].run_id,
|
|
||||||
'error': 'age_restriction',
|
|
||||||
'error_message': 'Content requires age confirmation',
|
|
||||||
'error_details': error_details
|
|
||||||
}
|
|
||||||
result_queue = f"{params['queue_name']}_result"
|
|
||||||
fail_queue = f"{params['queue_name']}_fail"
|
|
||||||
|
|
||||||
progress_queue = f"{params['queue_name']}_progress"
|
|
||||||
|
|
||||||
with client.pipeline() as pipe:
|
|
||||||
pipe.hset(result_queue, url, json.dumps(result_data))
|
|
||||||
pipe.hset(fail_queue, url, json.dumps(result_data))
|
|
||||||
pipe.hdel(progress_queue, url)
|
|
||||||
pipe.execute()
|
|
||||||
|
|
||||||
logger.info(f"Stored age restriction error for URL '{url}' in '{result_queue}' and '{fail_queue}'.")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Could not report age restriction error to Redis: {e}", exc_info=True)
|
|
||||||
|
|
||||||
# This is NOT a fatal error for the processing loop - we just continue with the next URL
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# DAG Definition with TaskGroups
|
# DAG Definition with TaskGroups
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@ -1106,9 +1177,10 @@ with DAG(
|
|||||||
'timeout': Param(DEFAULT_TIMEOUT, type="integer"),
|
'timeout': Param(DEFAULT_TIMEOUT, type="integer"),
|
||||||
'on_bannable_failure': Param('stop_loop_on_auth_proceed_on_download_error', type="string", enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'retry_and_ban_account_only', 'retry_on_connection_error', 'proceed_loop_under_manual_inspection', 'stop_loop_on_auth_proceed_on_download_error']),
|
'on_bannable_failure': Param('stop_loop_on_auth_proceed_on_download_error', type="string", enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'retry_and_ban_account_only', 'retry_on_connection_error', 'proceed_loop_under_manual_inspection', 'stop_loop_on_auth_proceed_on_download_error']),
|
||||||
'request_params_json': Param(json.dumps(DEFAULT_REQUEST_PARAMS), type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."),
|
'request_params_json': Param(json.dumps(DEFAULT_REQUEST_PARAMS), type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."),
|
||||||
|
'language_code': Param('en-US', type="string", title="[Worker Param] Language Code", description="The language code (e.g., 'en-US', 'de-DE') to use for the YouTube request headers."),
|
||||||
'auto_create_new_accounts_on_exhaustion': Param(True, type="boolean"),
|
'auto_create_new_accounts_on_exhaustion': Param(True, type="boolean"),
|
||||||
# --- Manual Run / Internal Parameters ---
|
# --- Manual Run / Internal Parameters ---
|
||||||
'manual_url_to_process': Param('iPwdia3gAnk', type=["string", "null"], title="[Manual Run] URL to Process", description="For manual runs, provide a single YouTube URL to process. This is ignored if triggered by the dispatcher."),
|
'manual_url_to_process': Param('iPwdia3gAnk', type=["string", "null"], title="[Manual Run] URL to Process", description="For manual runs, provide a single YouTube URL, or the special value 'PULL_FROM_QUEUE' to pull one URL from the Redis inbox. This is ignored if triggered by the dispatcher."),
|
||||||
'url_to_process': Param(None, type=["string", "null"], title="[Internal] URL from Dispatcher", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
|
'url_to_process': Param(None, type=["string", "null"], title="[Internal] URL from Dispatcher", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
|
||||||
'worker_queue': Param(None, type=["string", "null"], title="[Internal] Worker Queue", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
|
'worker_queue': Param(None, type=["string", "null"], title="[Internal] Worker Queue", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
|
||||||
}
|
}
|
||||||
@ -1121,7 +1193,7 @@ with DAG(
|
|||||||
fatal_error_task = handle_fatal_error()
|
fatal_error_task = handle_fatal_error()
|
||||||
report_failure_task = report_failure_and_continue()
|
report_failure_task = report_failure_and_continue()
|
||||||
continue_loop_task = continue_processing_loop()
|
continue_loop_task = continue_processing_loop()
|
||||||
age_restriction_task = handle_age_restriction_error()
|
unrecoverable_video_error_task = handle_unrecoverable_video_error()
|
||||||
report_bannable_and_continue_task = report_bannable_and_continue()
|
report_bannable_and_continue_task = report_bannable_and_continue()
|
||||||
|
|
||||||
# --- Task Group 1: Initial Attempt ---
|
# --- Task Group 1: Initial Attempt ---
|
||||||
@ -1138,7 +1210,7 @@ with DAG(
|
|||||||
)
|
)
|
||||||
|
|
||||||
first_token_attempt >> initial_branch_task
|
first_token_attempt >> initial_branch_task
|
||||||
initial_branch_task >> [fatal_error_task, ban_and_report_immediately_task, age_restriction_task, report_bannable_and_continue_task]
|
initial_branch_task >> [fatal_error_task, ban_and_report_immediately_task, unrecoverable_video_error_task, report_bannable_and_continue_task]
|
||||||
|
|
||||||
# --- Task Group 2: Retry Logic ---
|
# --- Task Group 2: Retry Logic ---
|
||||||
with TaskGroup("retry_logic", tooltip="Retry logic with account management") as retry_logic_group:
|
with TaskGroup("retry_logic", tooltip="Retry logic with account management") as retry_logic_group:
|
||||||
@ -1188,7 +1260,7 @@ with DAG(
|
|||||||
direct_retry_account_task >> coalesced_retry_data
|
direct_retry_account_task >> coalesced_retry_data
|
||||||
coalesced_retry_data >> retry_token_task
|
coalesced_retry_data >> retry_token_task
|
||||||
retry_token_task >> retry_branch_task
|
retry_token_task >> retry_branch_task
|
||||||
retry_branch_task >> [fatal_error_task, report_failure_task, ban_after_retry_report_task, age_restriction_task, report_bannable_and_continue_task]
|
retry_branch_task >> [fatal_error_task, report_failure_task, ban_after_retry_report_task, unrecoverable_video_error_task, report_bannable_and_continue_task]
|
||||||
ban_after_retry_report_task >> report_failure_task
|
ban_after_retry_report_task >> report_failure_task
|
||||||
|
|
||||||
# --- Task Group 3: Success/Continuation Logic ---
|
# --- Task Group 3: Success/Continuation Logic ---
|
||||||
@ -1210,7 +1282,7 @@ with DAG(
|
|||||||
|
|
||||||
# --- DAG Dependencies between TaskGroups ---
|
# --- DAG Dependencies between TaskGroups ---
|
||||||
# Initial attempt can lead to retry logic or direct failure
|
# Initial attempt can lead to retry logic or direct failure
|
||||||
initial_branch_task >> [retry_logic_group, fatal_error_task, ban_and_report_immediately_task, age_restriction_task, report_bannable_and_continue_task]
|
initial_branch_task >> [retry_logic_group, fatal_error_task, ban_and_report_immediately_task, unrecoverable_video_error_task, report_bannable_and_continue_task]
|
||||||
|
|
||||||
# A successful initial attempt bypasses retry and goes straight to the success group
|
# A successful initial attempt bypasses retry and goes straight to the success group
|
||||||
initial_attempt_group >> success_group
|
initial_attempt_group >> success_group
|
||||||
@ -1222,6 +1294,6 @@ with DAG(
|
|||||||
# Ban and report immediately leads to failure reporting
|
# Ban and report immediately leads to failure reporting
|
||||||
ban_and_report_immediately_task >> report_failure_task
|
ban_and_report_immediately_task >> report_failure_task
|
||||||
|
|
||||||
# Age restriction error leads to failure reporting and continues the loop
|
# Unrecoverable/bannable errors that don't stop the loop should continue processing
|
||||||
age_restriction_task >> continue_loop_task
|
unrecoverable_video_error_task >> continue_loop_task
|
||||||
report_bannable_and_continue_task >> continue_loop_task
|
report_bannable_and_continue_task >> continue_loop_task
|
||||||
|
|||||||
File diff suppressed because it is too large
@ -41,17 +41,24 @@ def run_s3_upload_batch(**context):
|
|||||||
Dry run mode is non-destructive and will pause briefly after checking to prevent tight loops.
|
Dry run mode is non-destructive and will pause briefly after checking to prevent tight loops.
|
||||||
"""
|
"""
|
||||||
params = context['params']
|
params = context['params']
|
||||||
|
ti = context['task_instance']
|
||||||
|
|
||||||
|
# Log the configured execution timeout for debugging purposes.
|
||||||
|
# This helps verify that the timeout setting from the DAG file is being applied.
|
||||||
|
timeout_delta = ti.task.execution_timeout
|
||||||
|
logger.info(f"Task is configured with execution_timeout: {timeout_delta}")
|
||||||
|
|
||||||
concurrency = params['concurrency']
|
concurrency = params['concurrency']
|
||||||
mode = params['mode']
|
mode = params['mode']
|
||||||
dry_run = params['dry_run']
|
dry_run = params['dry_run']
|
||||||
sleep_interval_min = params['sleep_if_no_videos_min']
|
sleep_interval_min = params['sleep_if_no_videos_min']
|
||||||
sleep_interval_sec = sleep_interval_min * 60
|
sleep_interval_sec = sleep_interval_min * 60
|
||||||
s3_conn_id = params['s3_conn_id']
|
s3_conn_id = params['s3_conn_id']
|
||||||
|
s3_bucket = params['s3_bucket_name']
|
||||||
|
|
||||||
s3_access_key_id = None
|
s3_access_key_id = None
|
||||||
s3_secret_access_key = None
|
 s3_secret_access_key = None
 s3_endpoint = None
-s3_bucket = None
 s3_region = None
 config_source = "Unknown"
 profile_name = "rusonyx"
@@ -68,12 +75,11 @@ def run_s3_upload_batch(**context):
 s3_endpoint = s3_conn.host

 extra_config = s3_conn.extra_dejson
-s3_bucket = extra_config.get('bucket')
 s3_region = extra_config.get('region_name')

-if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_bucket, s3_region]):
+if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_region]):
-logger.warning("S3 connection from Airflow is missing one or more required fields. Will attempt to fall back to environment variables.")
+logger.warning("S3 connection from Airflow is missing one or more required fields (excluding bucket). Will attempt to fall back to environment variables.")
-s3_access_key_id = s3_secret_access_key = s3_endpoint = s3_bucket = s3_region = None # Reset all
+s3_access_key_id = s3_secret_access_key = s3_endpoint = s3_region = None # Reset all
 else:
 config_source = f"Airflow Connection '{s3_conn_id}'"
 profile_name = "rusonyx-airflow"
@@ -82,17 +88,16 @@ def run_s3_upload_batch(**context):
 logger.warning(f"Failed to load S3 configuration from Airflow connection '{s3_conn_id}': {e}. Will attempt to fall back to environment variables.")

 # --- Attempt 2: Fallback to Environment Variables ---
-if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_bucket, s3_region]):
+if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_region]):
 try:
 logger.info("Attempting to load S3 configuration from environment variables as a fallback.")
 s3_access_key_id = os.environ['S3_DELIVERY_AWS_ACCESS_KEY_ID']
 s3_secret_access_key = os.environ['S3_DELIVERY_AWS_SECRET_ACCESS_KEY']
 s3_endpoint = os.environ['S3_DELIVERY_ENDPOINT']
-s3_bucket = os.environ['S3_DELIVERY_BUCKET']
 s3_region = os.environ['S3_DELIVERY_AWS_REGION']

-if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_bucket, s3_region]):
+if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_region]):
-raise ValueError("One or more S3 configuration environment variables are empty.")
+raise ValueError("One or more S3 configuration environment variables are empty (excluding bucket).")
 config_source = "Environment Variables"
 profile_name = "rusonyx"

@@ -100,6 +105,9 @@ def run_s3_upload_batch(**context):
 logger.error(f"Having problems reading S3 configuration from environment variables: {e}", exc_info=True)
 raise AirflowException("S3 configuration is missing. Could not load from Airflow connection or environment variables.")

+if not s3_bucket:
+raise AirflowException("S3 bucket name is not specified in DAG parameters.")
+
 s3_destination = f"s3://{s3_bucket}/"

 logger.info(f"Starting S3 upload loop. Watching source '{READY_PATH}' for delivery to '{s3_destination}'.")
@@ -328,6 +336,21 @@ with DAG(
 2. Ansible updates an Airflow Variable named `s3_worker_hostnames` with a JSON list of all active uploader workers (typically dlXXX machines). Each worker listens to its own queue (e.g., `queue-dl-dl001`).
 3. This DAG reads the variable on manual trigger or after a pause/resume cycle to create the dynamic tasks. This allows for easy inspection of per-worker logs and status from the Airflow UI.
 4. Each dynamic task watches a shared folder (`/opt/airflow/downloadfiles/videos/ready`). Download workers place completed videos into timestamped sub-folders (e.g., `20241122T1050`). The uploader processes these 10-minute batches, copying them to S3 with `s5cmd` and then deleting the source directories. This design avoids race conditions and improves performance.

+#### Why use 10-minute batch folders?
+
+While an `mv` command (atomic on the same filesystem) is sufficient to ensure a single video directory is complete when it appears in the `ready` folder, the batching system solves higher-level concurrency and efficiency problems in a high-throughput environment.
+
+- **Concurrency Management**: The uploader needs to process a discrete *set* of videos. By working on batches from a *previous* time window (e.g., uploading the `10:40` batch after `10:50`), it guarantees that no new files will be added to that batch while it's being processed. This creates a clean, reliable unit of work and prevents the uploader from missing videos that are moved in while it's compiling its list.
+- **Bulk Operation Efficiency**: It is far more efficient to upload hundreds of videos in a single bulk command than one by one. The batching system allows videos to accumulate, and the uploader sends them all to S3 in one highly optimized `s5cmd run` command. Similarly, after a successful upload, the uploader can delete the single parent batch directory, which is much faster than deleting hundreds of individual video folders.
+- **Continuous Operation**: The uploader task is a long-running loop. If processing a batch takes longer than 10 minutes (e.g., due to a large volume of videos or slow network), the uploader will continue working on that batch until it is complete. It only sleeps when it has processed all available completed batches and is waiting for new ones to become ready.
+
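To make the batching rule concrete, here is a minimal sketch (not the DAG's actual helper; the function names are illustrative) of how a timestamp maps to a 10-minute batch folder and when a batch becomes eligible under `batch_completion_wait_min`:

```python
from datetime import datetime, timedelta

def batch_folder_name(ts: datetime) -> str:
    """Floor a timestamp to its 10-minute window, e.g. 10:57 -> '20241122T1050'."""
    floored = ts.replace(minute=ts.minute - ts.minute % 10, second=0, microsecond=0)
    return floored.strftime("%Y%m%dT%H%M")

def batch_is_eligible(folder: str, now: datetime, batch_completion_wait_min: int = 0) -> bool:
    """With wait=0 the current batch is picked up immediately; wait=10 reproduces the
    old behaviour of only uploading a window after it has fully closed."""
    window_start = datetime.strptime(folder, "%Y%m%dT%H%M")
    return now >= window_start + timedelta(minutes=batch_completion_wait_min)

now = datetime(2024, 11, 22, 10, 57)
print(batch_folder_name(now))                        # 20241122T1050
print(batch_is_eligible("20241122T1040", now, 10))   # True  (window closed)
print(batch_is_eligible("20241122T1050", now, 10))   # False (still receiving videos)
```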
+#### Cleanup Method: `rsync` vs `shutil.rmtree`
+
+The cleanup process uses the `rsync` empty-folder trick to delete the contents of the batch directory before removing the directory itself. This is a deliberate performance optimization. The command is effectively: `rsync -a --delete /path/to/empty/ /path/to/delete/`.
+
+- Python's `shutil.rmtree` can be slow as it makes an individual `os.remove()` system call for every file.
+- The `rsync` method is a well-known and highly efficient alternative for this scenario, as `rsync` is a mature C program optimized for these operations. More details on this performance difference can be found here: https://stackoverflow.com/questions/5470939/why-is-shutil-rmtree-so-slow
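A minimal, self-contained sketch of that rsync-based cleanup (illustrative only; the DAG's own cleanup helper is not part of this diff):

```python
import os
import subprocess
import tempfile

def fast_rmtree(target_dir: str) -> None:
    """Empty target_dir by rsyncing an empty directory over it, then remove the husk.
    Same end result as shutil.rmtree(target_dir), but usually much faster when the
    directory holds a very large number of files."""
    with tempfile.TemporaryDirectory() as empty_dir:
        # Trailing slashes matter: sync the *contents* of empty_dir onto target_dir.
        subprocess.run(["rsync", "-a", "--delete", empty_dir + "/", target_dir + "/"], check=True)
    os.rmdir(target_dir)  # target_dir is now empty

# fast_rmtree("/opt/airflow/downloadfiles/videos/ready/20241122T1040")
```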
""",
|
""",
|
||||||
params={
|
params={
|
||||||
'mode': Param(
|
'mode': Param(
|
||||||
@ -339,9 +362,15 @@ with DAG(
|
|||||||
description="If True, the DAG will perform all steps except the actual upload and cleanup. `s5cmd` will be run with `--dry-run`, and the final directory removal will be skipped. Log messages will indicate what would have happened."
|
description="If True, the DAG will perform all steps except the actual upload and cleanup. `s5cmd` will be run with `--dry-run`, and the final directory removal will be skipped. Log messages will indicate what would have happened."
|
||||||
),
|
),
|
||||||
'concurrency': Param(10, type="integer", title="s5cmd Concurrency"),
|
'concurrency': Param(10, type="integer", title="s5cmd Concurrency"),
|
||||||
'sleep_if_no_videos_min': Param(10, type="integer", title="Sleep if Idle (minutes)", description="How many minutes the task should sleep if no videos are found to upload."),
|
'sleep_if_no_videos_min': Param(5, type="integer", title="Sleep if Idle (minutes)", description="How many minutes the task should sleep if no videos are found to upload. This should be less than any external timeout (e.g., Celery's worker_proc_timeout)."),
|
||||||
'batch_completion_wait_min': Param(0, type="integer", title="Batch Completion Wait (minutes)", description="How many minutes to wait after a 10-minute batch window closes before considering it for upload. Default is 0, which processes the current batch immediately. A value of 10 restores the old behavior of waiting for the next 10-minute window."),
|
'batch_completion_wait_min': Param(0, type="integer", title="Batch Completion Wait (minutes)", description="How many minutes to wait after a 10-minute batch window closes before considering it for upload. Default is 0, which processes the current batch immediately. A value of 10 restores the old behavior of waiting for the next 10-minute window."),
|
||||||
's3_conn_id': Param('s3_delivery_connection', type="string", title="S3 Connection ID", description="The Airflow connection ID for the S3-compatible storage. If this connection is invalid or missing, the task will fall back to environment variables."),
|
's3_conn_id': Param('s3_delivery_connection', type="string", title="S3 Connection ID", description="The Airflow connection ID for the S3-compatible storage. If this connection is invalid or missing, the task will fall back to environment variables."),
|
||||||
|
's3_bucket_name': Param(
|
||||||
|
'videos',
|
||||||
|
type="string",
|
||||||
|
title="S3 Bucket Name",
|
||||||
|
description="The name of the S3 bucket to upload to. Common values are 'videos' or 'videos-prod'."
|
||||||
|
),
|
||||||
}
|
}
|
||||||
) as dag:
|
) as dag:
|
||||||
|
|
||||||
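The upload routine itself is outside this hunk. The sketch below only illustrates how the `dry_run` and `concurrency` params could translate into one bulk `s5cmd run` call as described in the docs; the command-file layout and flags such as `--numworkers` are assumptions here, not taken from the DAG:

```python
import os
import subprocess
import tempfile

def upload_batch_with_s5cmd(batch_dir: str, s3_destination: str,
                            concurrency: int = 10, dry_run: bool = False) -> None:
    batch_name = os.path.basename(batch_dir.rstrip("/"))
    # One `cp` line per video directory; `s5cmd run` executes them all in a single process.
    commands = [
        f'cp "{os.path.join(batch_dir, video)}/*" "{s3_destination}{batch_name}/{video}/"\n'
        for video in sorted(os.listdir(batch_dir))
    ]
    with tempfile.NamedTemporaryFile("w", suffix=".s5cmd", delete=False) as f:
        f.writelines(commands)
        command_file = f.name
    cmd = ["s5cmd", "--numworkers", str(concurrency)]
    if dry_run:
        cmd.append("--dry-run")  # log what would be copied without actually uploading
    cmd += ["run", command_file]
    subprocess.run(cmd, check=True)
```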
@@ -410,7 +439,8 @@ with DAG(
 # Create a task for each worker, pinned to its specific queue
 upload_task = task(
 task_id=f'upload_batch_on_{task_id_hostname}',
-queue=f'queue-s3-{hostname}'
+queue=f'queue-s3-{hostname}',
+execution_timeout=timedelta(days=1),
 )(run_s3_upload_batch)()
 worker_tasks.append(upload_task)

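Putting this hunk together with the `s3_worker_hostnames` Variable described in the DAG docs, the dynamic fan-out looks roughly like the sketch below (consistent with the diff, but the loop and the hostname sanitisation are assumptions):

```python
import json
from datetime import timedelta
from airflow.decorators import task
from airflow.models import Variable

def run_s3_upload_batch(**context):
    ...  # placeholder for the long-running upload loop defined earlier in this DAG file

worker_hostnames = json.loads(Variable.get("s3_worker_hostnames", default_var="[]"))

worker_tasks = []
for hostname in worker_hostnames:
    task_id_hostname = hostname.replace("-", "_").replace(".", "_")  # assumed sanitisation
    upload_task = task(
        task_id=f"upload_batch_on_{task_id_hostname}",
        queue=f"queue-s3-{hostname}",           # pin execution to this worker's queue
        execution_timeout=timedelta(days=1),    # the upload loop runs for a long time
    )(run_s3_upload_batch)()
    worker_tasks.append(upload_task)
```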
@@ -138,6 +138,7 @@ def generate_configs():
 logging.info(f"Service role for generation: '{service_role}'")

 # --- Camoufox Configuration (only for worker/all-in-one roles) ---
+logging.info("--- Camoufox (Remote Browser) Configuration ---")
 camoufox_proxies = []
 expanded_camoufox_proxies_str = ""
 if service_role != 'management':
@@ -210,7 +211,7 @@ def generate_configs():
 logging.info("This file maps each proxy to a list of WebSocket endpoints for Camoufox.")
 logging.info("The token_generator uses this map to connect to the correct remote browser.")
 else:
-logging.info("Skipping Camoufox configuration generation for 'management' role.")
+logging.info("Skipping Camoufox configuration generation.")

 # --- Generate docker-compose-ytdlp-ops.yaml ---
 ytdlp_ops_template = env.get_template('docker-compose-ytdlp-ops.yaml.j2')
@@ -1,64 +1,46 @@
-# Ansible for YT-DLP Cluster
+# Ansible Deployment for YT-DLP Cluster

-This directory contains the Ansible playbooks, roles, and configurations for deploying and managing the YT-DLP Airflow cluster.
+This document provides an overview of the Ansible playbooks used to deploy and manage the YT-DLP Airflow cluster.

-**Note:** All commands should be run from the project root, not from within this directory.
-Example: `ansible-playbook ansible/playbook-full.yml`
+## Main Playbooks

-## Full Deployment
+These are the primary entry points for cluster management.

-### Deploy entire cluster with proxies (recommended for new setups):
+- `playbook-full-with-proxies.yml`: **(Recommended Entry Point)** Deploys shadowsocks proxies and then the entire application stack.
+- `playbook-full.yml`: Deploys the entire application stack (master and workers) without touching proxies.
+- `playbook-master.yml`: Deploys/updates only the Airflow master node.
+- `playbook-worker.yml`: Deploys/updates all Airflow worker nodes.
+- `playbook-proxies.yml`: Deploys/updates only the shadowsocks proxy services on all nodes.

-```bash
-ansible-playbook ansible/playbook-full-with-proxies.yml
-```
+## Component & Utility Playbooks

-### Deploy cluster without proxies:
+These playbooks are used for more specific tasks or are called by the main playbooks.

-```bash
-ansible-playbook ansible/playbook-full.yml
-```
+### Core Deployment Logic
+- `roles/airflow-master/tasks/main.yml`: Contains all tasks for setting up the Airflow master services.
+- `roles/airflow-worker/tasks/main.yml`: Contains all tasks for setting up the Airflow worker services.
+- `roles/ytdlp-master/tasks/main.yml`: Contains tasks for setting up the YT-DLP management services on the master.
+- `roles/ytdlp-worker/tasks/main.yml`: Contains tasks for setting up YT-DLP, Camoufox, and other worker-specific services.

-## Targeted Deployments
+### Utility & Maintenance
+- `playbook-dags.yml`: Quickly syncs only the `dags/` and `config/` directories to all nodes.
+- `playbook-hook.yml`: Syncs Airflow custom hooks and restarts relevant services.
+- `playbook-sync-local.yml`: Syncs local development files (e.g., `ytops_client`, `pangramia`) to workers.
+- `playbooks/pause_worker.yml`: Pauses a worker by creating a lock file, preventing it from taking new tasks.
+- `playbooks/resume_worker.yml`: Resumes a paused worker by removing the lock file.
+- `playbooks/playbook-bgutils-start.yml`: Starts the `bgutil-provider` container.
+- `playbooks/playbook-bgutils-stop.yml`: Stops the `bgutil-provider` container.
+- `playbook-update-s3-vars.yml`: Updates the `s3_delivery_connection` in Airflow.
+- `playbook-update-regression-script.yml`: Updates the `regression.py` script on the master.

-### Deploy only to master node:
+### Deprecated
+- `playbook-dl.yml`: Older worker deployment logic. Superseded by `playbook-worker.yml`.
+- `playbook-depricated.dl.yml`: Older worker deployment logic. Superseded by `playbook-worker.yml`.

-```bash
-ansible-playbook ansible/playbook-master.yml --limit="af-test"
-```
+## Current Goal: Disable Camoufox & Enable Aria2

-### Deploy only to worker nodes:
+The current objective is to modify the worker deployment (`playbook-worker.yml` and its role `roles/ytdlp-worker/tasks/main.yml`) to:
+1. **Disable Camoufox**: Prevent the build, configuration generation, and startup of all `camoufox` services.
+2. **Enable Aria2**: Ensure the `aria2-pro` service is built and started correctly on worker nodes.

-```bash
-ansible-playbook ansible/playbook-worker.yml
-```
+The `playbook-worker.yml` has already been updated to build the `aria2-pro` image. The next steps will involve modifying `roles/ytdlp-worker/tasks/main.yml` to remove the Camoufox-related tasks.

-## DAGs Only Deployment
-
-To update only DAG files and configurations:
-
-```bash
-ansible-playbook ansible/playbook-dags.yml
-```
-
-## Managing Worker State (Pause/Resume)
-
-The system allows for gracefully pausing a worker to prevent it from picking up new tasks. This is useful for maintenance or decommissioning a node. The mechanism uses a lock file (`AIRFLOW.PREVENT_URL_PULL.lock`) on the worker host.
-
-### To Pause a Worker
-
-This command creates the lock file, causing the `ytdlp_ops_dispatcher` DAG to skip task execution on this host.
-
-```bash
-# Replace "worker-hostname" with the target host from your inventory
-ansible-playbook ansible/playbooks/pause_worker.yml --limit "worker-hostname"
-```
-
-### To Resume a Worker
-
-This command removes the lock file, allowing the worker to resume picking up tasks.
-
-```bash
-# Replace "worker-hostname" with the target host from your inventory
-ansible-playbook ansible/playbooks/resume_worker.yml --limit "worker-hostname"
-```
@@ -13,3 +13,4 @@ vault_s3_delivery_secret_access_key: "33b155c5d2ea4fccb0faeeefb420d7ac"
 vault_s3_delivery_endpoint: "https://s3.rusonyxcloud.ru"
 vault_s3_delivery_bucket: "videos"
 vault_s3_delivery_aws_region: "ru-msk"
+vault_aria2_rpc_secret: "aR1a2_sEcReT_pWd_f0r_yTd1p"
@@ -11,7 +11,7 @@
 src: "../airflow/dags/"
 dest: /srv/airflow_master/dags/
 archive: yes
-delete: yes
+delete: no
 rsync_path: "sudo rsync"
 rsync_opts:
 - "--exclude=__pycache__/"
@@ -42,7 +42,7 @@
 src: "../airflow/dags/"
 dest: /srv/airflow_dl_worker/dags/
 archive: yes
-delete: yes
+delete: no
 rsync_path: "sudo rsync"
 rsync_opts:
 - "--exclude=__pycache__/"
|||||||
files:
|
files:
|
||||||
- configs/docker-compose-dl.yaml
|
- configs/docker-compose-dl.yaml
|
||||||
- configs/docker-compose-ytdlp-ops.yaml
|
- configs/docker-compose-ytdlp-ops.yaml
|
||||||
- configs/docker-compose.camoufox.yaml
|
|
||||||
state: present
|
state: present
|
||||||
remove_orphans: true
|
remove_orphans: true
|
||||||
pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"
|
pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"
|
||||||
|
|||||||
@ -216,6 +216,17 @@
|
|||||||
become: yes
|
become: yes
|
||||||
become_user: "{{ ansible_user }}"
|
become_user: "{{ ansible_user }}"
|
||||||
|
|
||||||
|
- name: Sync aria2-pro-docker to worker for build context
|
||||||
|
ansible.posix.synchronize:
|
||||||
|
src: "../airflow/aria2-pro-docker/"
|
||||||
|
dest: "{{ airflow_worker_dir }}/aria2-pro-docker/"
|
||||||
|
rsync_opts:
|
||||||
|
- "--delete"
|
||||||
|
recursive: yes
|
||||||
|
perms: yes
|
||||||
|
become: yes
|
||||||
|
become_user: "{{ ansible_user }}"
|
||||||
|
|
||||||
- name: Ensure bin directory exists on worker for build context
|
- name: Ensure bin directory exists on worker for build context
|
||||||
ansible.builtin.file:
|
ansible.builtin.file:
|
||||||
path: "{{ airflow_worker_dir }}/bin"
|
path: "{{ airflow_worker_dir }}/bin"
|
||||||
@@ -275,15 +286,6 @@
 - name: Include Docker health check tasks
 include_tasks: tasks/docker_health_check.yml

-- name: Build local Docker images (e.g., camoufox)
-ansible.builtin.command: >
-docker compose --project-directory . -f configs/docker-compose-ytdlp-ops.yaml build
-args:
-chdir: "{{ airflow_worker_dir }}"
-become: yes
-become_user: "{{ ansible_user }}"
-register: docker_build_result
-changed_when: "'Building' in docker_build_result.stdout or 'writing image' in docker_build_result.stdout"
-
 - name: Pull pre-built Docker images for ytdlp-ops services
 ansible.builtin.command: >
@@ -47,7 +47,6 @@
 - "docker-compose-ytdlp-ops.yaml.j2"
 - "docker-compose.config-generate.yaml"
 - "envoy.yaml.j2"
-- "docker-compose.camoufox.yaml.j2"

 - name: Create .env file for YT-DLP master service
 template:
@@ -117,19 +116,6 @@
 recurse: yes
 become: yes

-- name: Create dummy camoufox compose file for master to prevent errors
-copy:
-content: |
-# This is a placeholder file.
-# The master node does not run Camoufox, but the shared docker-compose-ytdlp-ops.yaml
-# may unconditionally include this file, causing an error if it's missing.
-# This file provides an empty services block to satisfy the include.
-services: {}
-dest: "{{ airflow_master_dir }}/configs/docker-compose.camoufox.yaml"
-mode: "{{ file_permissions }}"
-owner: "{{ ssh_user }}"
-group: "{{ deploy_group }}"
-become: yes
-
 - name: Check for shadowsocks-rust proxy compose file
 stat:
@@ -66,18 +66,7 @@
 - name: "Log: Syncing YT-DLP service files"
 debug:
-msg: "Syncing YT-DLP service components (config generator, envoy/camoufox templates) to the worker node."
+msg: "Syncing YT-DLP service components (config generator, envoy templates) to the worker node."

-- name: Sync YT-DLP service files to worker
-synchronize:
-src: "../{{ item }}"
-dest: "{{ airflow_worker_dir }}/"
-archive: yes
-recursive: yes
-rsync_path: "sudo rsync"
-rsync_opts: "{{ rsync_default_opts }}"
-loop:
-- "airflow/camoufox"
-
 - name: Sync YT-DLP config generator to worker
 synchronize:
@@ -99,7 +88,6 @@
 - "docker-compose-ytdlp-ops.yaml.j2"
 - "docker-compose.config-generate.yaml"
 - "envoy.yaml.j2"
-- "docker-compose.camoufox.yaml.j2"

 - name: Sync Airflow build context to worker
 synchronize:
@@ -209,19 +197,35 @@
 force_source: true
 when: not fast_deploy | default(false)

-- name: "Log: Building Camoufox (remote browser) image"
+- name: "Log: Building aria2-pro image"
 debug:
-msg: "Building the Camoufox image locally. This image provides remote-controlled Firefox browsers for token generation."
+msg: "Building the aria2-pro image locally. This image provides the download manager."

-- name: Build Camoufox image from local Dockerfile
-community.docker.docker_image:
-name: "camoufox:latest"
-build:
-path: "{{ airflow_worker_dir }}/camoufox"
-source: build
-force_source: true
 when: not fast_deploy | default(false)

+- name: Build aria2-pro image from docker-compose
+ansible.builtin.command: >
+docker compose -f configs/docker-compose.airflow.yml build aria2-pro
+args:
+chdir: "{{ airflow_worker_dir }}"
+become: yes
+become_user: "{{ ansible_user }}"
+register: docker_build_result
+changed_when: "'Building' in docker_build_result.stdout or 'writing image' in docker_build_result.stdout"
+when: not fast_deploy | default(false)
+
+# - name: "Log: Building Camoufox (remote browser) image"
+# debug:
+# msg: "Building the Camoufox image locally. This image provides remote-controlled Firefox browsers for token generation."
+#
+# - name: Build Camoufox image from local Dockerfile
+# community.docker.docker_image:
+# name: "camoufox:latest"
+# build:
+# path: "{{ airflow_worker_dir }}/camoufox"
+# source: build
+# force_source: true
+# when: not fast_deploy | default(false)
+
 - name: Ensure correct permissions for build context after generation
 file:
 path: "{{ airflow_worker_dir }}"
@@ -245,7 +249,6 @@
 project_src: "{{ airflow_worker_dir }}"
 files:
 - "configs/docker-compose-ytdlp-ops.yaml"
-- "configs/docker-compose.camoufox.yaml"
 - "configs/docker-compose.airflow.yml"
 state: absent
 remove_volumes: true # Corresponds to docker compose down -v
@@ -259,20 +262,19 @@
 - name: "Log: Starting all worker services"
 debug:
-msg: "Starting all worker services: ytdlp-ops, camoufox, and airflow-worker."
+msg: "Starting all worker services: ytdlp-ops, and airflow-worker."

 - name: Start all worker services
 community.docker.docker_compose_v2:
 project_src: "{{ airflow_worker_dir }}"
 files:
 - "configs/docker-compose-ytdlp-ops.yaml"
-- "configs/docker-compose.camoufox.yaml"
 - "configs/docker-compose.airflow.yml"
 state: present
 remove_orphans: true
 pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"
 recreate: always # Corresponds to --force-recreate

-- name: Include camoufox verification tasks
+# - name: Include camoufox verification tasks
-include_tasks: ../../../tasks/verify_camoufox.yml
+# include_tasks: ../../../tasks/verify_camoufox.yml
-when: not fast_deploy | default(false)
+# when: not fast_deploy | default(false)
@@ -27,19 +27,55 @@ execution_control:

 info_json_generation_policy:
 # Use a standard client. The server will handle token generation.
-client: web
+client: tv_simply

 ---
-# Policy: Test download specific DASH formats from a folder of info.jsons.
-# This policy uses a single worker to test-download a list of video-only DASH
-# formats from a directory of existing info.json files. It only downloads the
-# first 10KB of each format and sleeps between each file.
-name: download_dashy_formats_test
+# Policy: Full-stack test with visitor ID rotation and test download.
+# This policy uses a single worker to fetch info.json files for a list of URLs,
+# and then immediately performs a test download (first 10KB) of specified formats.
+# It simulates user churn by creating a new profile (and thus a new visitor_id and POT)
+# every 250 requests. A short sleep is used between requests.
+name: full_stack_with_visitor_id_rotation
+
+settings:
+mode: full_stack
+urls_file: "urls.txt" # Placeholder, should be overridden with --set
+info_json_script: "bin/ytops-client get-info"
+# Use the modern profile management system to rotate visitor_id.
+profile_mode: per_worker_with_rotation
+profile_management:
+prefix: "visitor_rotator"
+# Rotate to a new profile generation after 250 requests.
+max_requests_per_profile: 250
+
+execution_control:
+run_until: { cycles: 1 } # Run through the URL list once.
+workers: 1 # Run with a single worker thread.
+# A short, fixed sleep between each info.json request.
+sleep_between_tasks: { min_seconds: 0.75, max_seconds: 0.75 }
+
+info_json_generation_policy:
+# Use a standard client. The server will handle token generation.
+client: tv_simply
+
+download_policy:
+formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy"
+downloader: "native-py"
+extra_args: '--test --cleanup'
+output_dir: "downloads/fetch_and_test"
+sleep_between_formats: { min_seconds: 6, max_seconds: 6 }
+
+---
+# Policy: Download-only test from a fetch folder (Batch Mode).
+# This policy scans a directory of existing info.json files once, and performs
+# a test download (first 10KB) for specific formats. It is designed to run as
+# a batch job after a 'fetch_only' policy has completed.
+name: download_only_test_from_fetch_folder

 settings:
 mode: download_only
 # Directory of info.json files to process.
-info_json_dir: "fetched_info_jsons/visitor_id_rotation" # Assumes output from the above policy
+info_json_dir: "fetched_info_jsons/visitor_id_rotation" # Assumes output from 'fetch_with_visitor_id_rotation'

 execution_control:
 run_until: { cycles: 1 } # Run through the info.json directory once.
@@ -49,10 +85,42 @@ execution_control:

 download_policy:
 # A specific list of video-only DASH formats to test.
-# The "-dashy" suffix is illustrative; the format IDs must exist in the info.json.
 formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy"
-# Use the native Python downloader for better performance and control.
 downloader: "native-py"
-# Pass extra arguments to yt-dlp to perform a "test" download (first 10KB).
-extra_args: '--download-sections "*0-10240"'
+# Pass extra arguments to perform a "test" download.
+extra_args: '--test --cleanup'
 output_dir: "downloads/dash_test"
+
+---
+# Policy: Live download from a watch folder (Continuous Mode).
+# This policy continuously watches a directory for new info.json files and
+# processes them as they appear. It is designed to work as the second stage
+# of a pipeline, consuming files generated by a 'fetch_only' policy.
+name: live_download_from_watch_folder
+
+settings:
+mode: download_only
+info_json_dir: "live_info_json" # A different directory for the live pipeline
+directory_scan_mode: continuous
+mark_processed_files: true # Rename files to *.processed to avoid re-downloading.
+max_files_per_cycle: 50 # Process up to 50 new files each time it checks.
+sleep_if_no_new_files_seconds: 15
+
+execution_control:
+# For 'continuous' mode, a time-based run_until is typical.
+# {cycles: 1} will scan once, process new files, and exit.
+# To run for 2 hours, for example, use: run_until: { minutes: 120 }
+run_until: { cycles: 1 }
+workers: 4 # Use a few workers to process files in parallel.
+# sleep_between_tasks controls the pause between processing different info.json files.
+# To pause before each download attempt starts, use 'pause_before_download_seconds'
+# in the download_policy section below.
+sleep_between_tasks: { min_seconds: 0, max_seconds: 0 }
+
+download_policy:
+formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy"
+downloader: "native-py"
+# Example: Pause for a few seconds before starting each download attempt.
+# pause_before_download_seconds: 2
+extra_args: '--test --cleanup'
+output_dir: "downloads/live_dash_test"
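The `per_worker_with_rotation` behaviour referenced by these policies is implemented by the orchestrator and is not part of this diff; the sketch below only illustrates the documented rule (a fresh profile, and therefore a fresh visitor_id, after `max_requests_per_profile` requests). Class and naming scheme are hypothetical:

```python
class RotatingProfile:
    """Illustrative only: yields profile names like 'visitor_rotator_w0_g1' and moves to a
    new generation once max_requests_per_profile requests have been made."""

    def __init__(self, prefix: str, worker_id: int, max_requests_per_profile: int = 250):
        self.prefix = prefix
        self.worker_id = worker_id
        self.limit = max_requests_per_profile
        self.generation = 0
        self.requests_made = 0

    def current(self) -> str:
        if self.requests_made >= self.limit:
            self.generation += 1          # new profile -> new visitor_id / POT
            self.requests_made = 0
        self.requests_made += 1
        return f"{self.prefix}_w{self.worker_id}_g{self.generation}"

profile = RotatingProfile("visitor_rotator", worker_id=0, max_requests_per_profile=250)
names = [profile.current() for _ in range(500)]
assert names[0].endswith("_g0") and names[-1].endswith("_g1")
```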
policies/5_ban_test_policies.yaml (new file, 84 lines)
@@ -0,0 +1,84 @@
+# This file contains policies for testing ban rates and profile survival
+# under high request counts.
+
+---
+# Policy: Single Profile Ban Test (500 Requests)
+# This policy uses a single worker and a single, non-rotating profile to make
+# 500 consecutive info.json requests. It is designed to test if and when a
+# single profile/visitor_id gets banned or rate-limited by YouTube.
+#
+# It explicitly disables the server's automatic visitor ID rotation to ensure
+# the same identity is used for all requests.
+#
+# The test will stop if it encounters 3 errors within any 1-minute window,
+# or a total of 8 errors within any 60-minute window.
+name: single_profile_ban_test_500
+
+settings:
+mode: fetch_only
+urls_file: "urls.txt" # Override with --set settings.urls_file=...
+info_json_script: "bin/ytops-client get-info"
+save_info_json_dir: "fetched_info_jsons/ban_test_single_profile"
+# Use one worker with one profile that does not rotate automatically.
+profile_mode: per_worker_with_rotation
+profile_management:
+prefix: "ban_test_user"
+# Set a high request limit to prevent the orchestrator from rotating the profile.
+max_requests_per_profile: 1000
+
+execution_control:
+run_until: { requests: 500 } # Stop after 500 total requests.
+workers: 1
+sleep_between_tasks: { min_seconds: 1, max_seconds: 2 }
+
+info_json_generation_policy:
+client: "tv_simply" # A typical client for this kind of test.
+# Explicitly disable the server's visitor ID rotation mechanism.
+request_params:
+session_params:
+visitor_rotation_threshold: 0
+
+stop_conditions:
+# Stop if we get 3 or more errors in any 1-minute window (rapid failure).
+on_error_rate: { max_errors: 3, per_minutes: 1 }
+# Stop if we get 8 or more 403 errors in any 60-minute window (ban detection).
+on_cumulative_403: { max_errors: 8, per_minutes: 60 }
+
+---
+# Policy: Multi-Profile Survival Test
+# This policy uses 5 parallel workers, each with its own unique profile.
+# It tests whether using multiple profiles with the server's default automatic
+# visitor ID rotation (every 250 requests) can sustain a high request rate
+# without getting banned.
+#
+# The test will run until 1250 total requests have been made (250 per worker),
+# which should trigger one rotation for each profile.
+name: multi_profile_survival_test
+
+settings:
+mode: fetch_only
+urls_file: "urls.txt" # Override with --set settings.urls_file=...
+info_json_script: "bin/ytops-client get-info"
+save_info_json_dir: "fetched_info_jsons/ban_test_multi_profile"
+# Use 5 workers, each getting its own rotating profile.
+profile_mode: per_worker_with_rotation
+profile_management:
+prefix: "survival_test_user"
+# Use the default rotation threshold of 250 requests per profile.
+max_requests_per_profile: 250
+
+execution_control:
+run_until: { requests: 1250 } # 5 workers * 250 requests/rotation = 1250 total.
+workers: 5
+sleep_between_tasks: { min_seconds: 1, max_seconds: 2 }
+
+info_json_generation_policy:
+client: "tv_simply"
+# No request_params are needed here; we want to use the server's default
+# visitor ID rotation behavior.
+
+stop_conditions:
+# Stop if we get 3 or more errors in any 1-minute window (rapid failure).
+on_error_rate: { max_errors: 3, per_minutes: 1 }
+# Stop if we get 8 or more 403 errors in any 60-minute window (ban detection).
+on_cumulative_403: { max_errors: 8, per_minutes: 60 }
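The `stop_conditions` above describe sliding-window thresholds. A minimal sketch of that counting logic (illustrative, not the stress tool's implementation):

```python
import time
from collections import deque

class SlidingWindowStop:
    """Trip when max_errors occur within per_minutes, e.g. { max_errors: 3, per_minutes: 1 }."""

    def __init__(self, max_errors, per_minutes):
        self.max_errors = max_errors
        self.window_seconds = per_minutes * 60
        self.timestamps = deque()

    def record_error(self, now=None):
        """Record one error; return True if the stop condition is now met."""
        now = time.time() if now is None else now
        self.timestamps.append(now)
        # Drop errors that have aged out of the window.
        while self.timestamps and now - self.timestamps[0] > self.window_seconds:
            self.timestamps.popleft()
        return len(self.timestamps) >= self.max_errors

rapid = SlidingWindowStop(max_errors=3, per_minutes=1)
print(rapid.record_error(0), rapid.record_error(30), rapid.record_error(59))  # False False True
print(rapid.record_error(200))  # False again: the first three errors have aged out
```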
@@ -27,7 +27,10 @@ def main():
 if last_arg.startswith('-') and len(last_arg) == 11:
 import re
 if re.fullmatch(r'-[a-zA-Z0-9_-]{10}', last_arg):
-sys.argv.insert(len(sys.argv) - 1, '--')
+# Only insert '--' if it's not already the preceding argument.
+# This prevents `stress_policy_tool` which already adds '--' from causing an error.
+if sys.argv[-2] != '--':
+sys.argv.insert(len(sys.argv) - 1, '--')

 parser = argparse.ArgumentParser(
 description="YT Ops Client Tools",
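The guard above exists because YouTube video IDs can start with '-', which argparse would otherwise treat as an option. A small standalone illustration of the problem and the fix (the video ID is made up):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("video_id")

video_id = "-dQw4w9WgXc"   # hypothetical 11-character ID starting with '-'

try:
    parser.parse_args([video_id])           # argparse sees an unknown option and bails out
except SystemExit:
    print("rejected without '--'")

args = parser.parse_args(["--", video_id])  # '--' ends option parsing; the ID is accepted
print(args.video_id)                        # -dQw4w9WgXc
```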
@@ -12,14 +12,16 @@ import glob
 import shutil
 import re
 import shlex
+import threading
 import time
 from urllib.parse import urljoin

 try:
 import aria2p
 from aria2p.utils import human_readable_bytes
+import yt_dlp
 except ImportError:
-print("aria2p is not installed. Please install it with: pip install aria2p", file=sys.stderr)
+print("aria2p or yt-dlp is not installed. Please install them with: pip install aria2p yt-dlp", file=sys.stderr)
 sys.exit(1)

 logger = logging.getLogger('download_aria_tool')
@@ -61,15 +63,18 @@ cat latest-info.json | yt-ops-client download aria-rpc -f "299/137" \\
 parser.add_argument('--aria-host', default='localhost', help='The host of the aria2c RPC server. Default: localhost.')
 parser.add_argument('--aria-port', type=int, default=6800, help='The port of the aria2c RPC server. Default: 6800.')
 parser.add_argument('--aria-secret', help='The secret token for the aria2c RPC server (often required, e.g., "SQGCQPLVFQIASMPNPOJYLVGJYLMIDIXDXAIXOTX").')
-parser.add_argument('--proxy', help='Proxy to use for the download, e.g., "socks5://127.0.0.1:1080".')
+parser.add_argument('--proxy', help='Proxy to use for the download, e.g., "socks5://127.0.0.1:1080". This sets the "all-proxy" option in aria2c.')
 parser.add_argument('--downloader-args', help='Arguments for aria2c, in yt-dlp format (e.g., "aria2c:[-x 8, -k 1M]").')
 parser.add_argument('--wait', action='store_true', help='Wait for the download to complete and report its status. Note: This makes the operation synchronous and will block until the download finishes.')
 parser.add_argument('--wait-timeout', help='Timeout in seconds for waiting on downloads. Use "auto" to calculate based on a minimum speed of 200KiB/s. Requires --wait. Default: no timeout.')
+parser.add_argument('--max-concurrent-fragments', type=int, default=8, help='Maximum number of fragments to download concurrently when using --wait. Mimics aria2c\'s -j option. Default: 8.')
 parser.add_argument('--auto-merge-fragments', action='store_true', help='Automatically merge fragments after download. Requires --wait and assumes the script has filesystem access to the aria2c host.')
 parser.add_argument('--remove-fragments-after-merge', action='store_true', help='Delete individual fragment files after a successful merge. Requires --auto-merge-fragments.')
 parser.add_argument('--cleanup', action='store_true', help='After a successful download, remove the final file(s) from the filesystem. For fragmented downloads, this implies --remove-fragments-after-merge.')
 parser.add_argument('--remove-on-complete', action=argparse.BooleanOptionalAction, default=True, help='Remove the download from aria2c history on successful completion. Use --no-remove-on-complete to disable. May fail on older aria2c daemons.')
 parser.add_argument('--purge-on-complete', action='store_true', help='Use aria2.purgeDownloadResult to clear ALL completed/failed downloads from history on success. Use as a workaround for older daemons.')
+parser.add_argument('--add-header', action='append', help='Add a custom HTTP header for the download. Format: "Key: Value". Can be used multiple times.')
+parser.add_argument('--user-agent', help='Specify a custom User-Agent. Overrides any User-Agent from info.json, --add-header, or the default.')
 parser.add_argument('--verbose', action='store_true', help='Enable verbose output for this script.')
 return parser

@@ -101,6 +106,10 @@ def parse_aria_error(download):
 if not error_message:
 return f"Unknown aria2c error (Code: {error_code})"

+# Handle specific error codes that provide more context
+if error_code == 24: # Authorization failed
+return f"HTTP Authorization Failed (Error 24). The URL may have expired or requires valid cookies/headers. Raw message: {error_message}"
+
 # Check for common HTTP errors in the message
 http_status_match = re.search(r'HTTP status (\d+)', error_message)
 if http_status_match:
@@ -144,6 +153,8 @@ def parse_aria_args_to_options(args_str):
 parser.add_argument('-x', '--max-connection-per-server')
 parser.add_argument('-k', '--min-split-size')
 parser.add_argument('-s', '--split')
+parser.add_argument('--http-proxy')
+parser.add_argument('--https-proxy')
 parser.add_argument('--all-proxy')

 try:
@@ -151,8 +162,10 @@ def parse_aria_args_to_options(args_str):
 known_args, unknown_args = parser.parse_known_args(arg_list)
 if unknown_args:
 logger.warning(f"Ignoring unknown arguments in --downloader-args: {unknown_args}")
-# Convert to dict, removing None values
-return {k: v for k, v in vars(known_args).items() if v is not None}
+# Convert to dict, removing None values.
+# Convert to dict, removing None values, and converting underscores back to hyphens
+# to match the option format expected by aria2c's RPC interface.
+return {k.replace('_', '-'): v for k, v in vars(known_args).items() if v is not None}
 except Exception:
 logger.warning(f"Failed to parse arguments inside --downloader-args: '{inner_args_str}'")
 return {}
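A quick illustration of why that key conversion matters: argparse stores `--min-split-size` as `min_split_size`, while aria2c's RPC interface expects the hyphenated option name:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-k', '--min-split-size')
parser.add_argument('-x', '--max-connection-per-server')

ns = parser.parse_args(['-k', '2M', '-x', '8'])
print(vars(ns))
# {'min_split_size': '2M', 'max_connection_per_server': '8'}  <- underscores from argparse

options = {k.replace('_', '-'): v for k, v in vars(ns).items() if v is not None}
print(options)
# {'min-split-size': '2M', 'max-connection-per-server': '8'}  <- what aria2c RPC expects
```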
@@ -161,6 +174,9 @@ def parse_aria_args_to_options(args_str):
 def main_download_aria(args):
 """Main logic for the 'download-aria' command."""
 log_level = logging.DEBUG if args.verbose else logging.INFO
+# Reconfigure root logger to ensure our settings are applied.
+for handler in logging.root.handlers[:]:
+logging.root.removeHandler(handler)
 logging.basicConfig(level=log_level, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', stream=sys.stderr)

 if args.remove_fragments_after_merge and not args.auto_merge_fragments:
@@ -198,25 +214,43 @@ def main_download_aria(args):
 logger.error(f"Failed to parse info.json from {input_source_name}. Is the input valid JSON?")
 return 1

-# Find the requested format, supporting yt-dlp style selectors
-target_format = None
-# A format selector can be a comma-separated list of preferences,
-# where each preference can be a slash-separated list of format_ids.
-# e.g., "299/137/136,140" means try 299, then 137, then 136, then 140.
-format_preferences = [item.strip() for sublist in (i.split('/') for i in args.format.split(',')) for item in sublist if item.strip()]
-
-available_formats_map = {f['format_id']: f for f in info_data.get('formats', []) if 'format_id' in f}
-
-for format_id in format_preferences:
-if format_id in available_formats_map:
-target_format = available_formats_map[format_id]
-logger.info(f"Selected format ID '{format_id}' from selector '{args.format}'.")
-break
-
-if not target_format:
+# Find the requested format using yt-dlp's own selection logic
+try:
+# We don't need a full ydl instance, just the format selection logic.
+ydl = yt_dlp.YoutubeDL({'quiet': True, 'logger': logger, 'format': args.format})
+formats = info_data.get('formats', [])
+selector = ydl.build_format_selector(args.format)
+ctx = {
+'formats': formats,
+'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
+'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats)
+or all(f.get('acodec') == 'none' for f in formats)),
+}
+selected_formats = list(selector(ctx))
+except Exception as e:
+logger.error(f"Failed to select format with selector '{args.format}': {e}", exc_info=args.verbose)
+return 1
+
+if not selected_formats:
 logger.error(f"No suitable format found for selector '{args.format}' in info.json.")
 return 1

+# The selector might return multiple results if ',' is used. We'll process the first one.
+target_format = selected_formats[0]
+if len(selected_formats) > 1:
+logger.warning(f"Format selector '{args.format}' resolved to multiple format combinations. Only the first one will be downloaded.")
+
+formats_to_download = target_format.get('requested_formats', [target_format])
+if len(formats_to_download) > 1:
+logger.warning(
+f"The selected format is a combination of {len(formats_to_download)} streams. "
+f"This tool does not support merging separate video/audio streams. "
+f"Only the first stream (format_id: {formats_to_download[0].get('format_id')}) will be downloaded. "
+f"To download all streams, please specify their format IDs separately."
+)
+
+target_format = formats_to_download[0]
+
 # Get file size for auto-timeout and dynamic options
 total_filesize = target_format.get('filesize') or target_format.get('filesize_approx')

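For reference, the selection logic this hunk delegates to yt-dlp can be exercised on its own roughly like this (a sketch mirroring the code above; the sample `formats` list and URLs are made up):

```python
import yt_dlp

info_data = {
    "formats": [
        {"format_id": "137", "vcodec": "avc1", "acodec": "none", "url": "https://example.invalid/v"},
        {"format_id": "140", "vcodec": "none", "acodec": "mp4a", "url": "https://example.invalid/a"},
    ]
}

ydl = yt_dlp.YoutubeDL({"quiet": True, "format": "137/136"})
formats = info_data["formats"]
selector = ydl.build_format_selector("137/136")
ctx = {
    "formats": formats,
    "has_merged_format": any("none" not in (f.get("acodec"), f.get("vcodec")) for f in formats),
    "incomplete_formats": (all(f.get("vcodec") == "none" for f in formats)
                           or all(f.get("acodec") == "none" for f in formats)),
}
selected = list(selector(ctx))
print(selected[0]["format_id"])   # 137
```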
@@ -231,9 +265,9 @@ def main_download_aria(args):
 # Prepare options for aria2
 aria_options = {
 # Options from yt-dlp's aria2c integration for performance and reliability
+'continue': 'true',
 'max-connection-per-server': 16,
 'split': 16,
-'min-split-size': '1M',
 'http-accept-gzip': 'true',
 'file-allocation': 'none',
 }
@@ -243,20 +277,59 @@ def main_download_aria(args):

 custom_options = parse_aria_args_to_options(args.downloader_args)

-# Dynamically set min-split-size if not overridden by user
-if 'min_split_size' not in custom_options and total_filesize:
-if total_filesize > 100 * 1024 * 1024: # 100 MiB
+# Set min-split-size. yt-dlp's default is 1M.
+if 'min-split-size' not in custom_options:
+if total_filesize and total_filesize > 100 * 1024 * 1024: # 100 MiB
 aria_options['min-split-size'] = '5M'
 logger.info("File is > 100MiB, dynamically setting min-split-size to 5M.")
+else:
+aria_options['min-split-size'] = '1M'

 if custom_options:
 aria_options.update(custom_options)
 logger.info(f"Applied custom aria2c options from --downloader-args: {custom_options}")

+# For older aria2c versions, SOCKS5 proxy must be specified with an 'http://' scheme.
+if 'all-proxy' in aria_options and isinstance(aria_options['all-proxy'], str) and aria_options['all-proxy'].startswith('socks5://'):
+proxy_url = aria_options['all-proxy']
+logger.info("Replacing 'socks5://' with 'http://' in proxy URL for aria2c compatibility.")
+aria_options['all-proxy'] = 'http://' + proxy_url[len('socks5://'):]
+
 aria_options['out'] = filename

-# Add headers from info.json, mimicking yt-dlp's behavior for aria2c
-headers = target_format.get('http_headers')
+# Add headers from info.json, and allow overriding/adding with --add-header
+headers = target_format.get('http_headers', {}).copy()
+
+if args.add_header:
+for header in args.add_header:
+if ':' not in header:
+logger.error(f"Invalid header format in --add-header: '{header}'. Expected 'Key: Value'.")
+return 1
+key, value = header.split(':', 1)
+key = key.strip()
+value = value.strip()
+if key in headers:
+logger.info(f"Overwriting header '{key}' from info.json with value from command line.")
+else:
+logger.info(f"Adding header from command line: {key}: {value}")
+headers[key] = value
+
+# Enforce a consistent User-Agent.
+# First, remove any User-Agent that might have come from info.json, case-insensitively.
+for key in list(headers.keys()):
+if key.lower() == 'user-agent':
+del headers[key]
+
+# Set the default Cobalt User-Agent.
+default_user_agent = 'Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version'
+headers['User-Agent'] = default_user_agent
+logger.info(f"Set default User-Agent to: {default_user_agent}")
+
+# The --user-agent flag has the highest precedence and can override the default.
+if args.user_agent:
+headers['User-Agent'] = args.user_agent
+logger.info(f"Overriding User-Agent with value from --user-agent: {args.user_agent}")
+
 if headers:
 header_list = [f'{key}: {value}' for key, value in headers.items()]
 aria_options['header'] = header_list
@@ -268,6 +341,12 @@ def main_download_aria(args):
 else:
 logger.debug(f" Header: {h}")

+# Final check: ensure all option values are strings, as required by aria2c RPC.
+# The 'header' option is a list of strings, which is a special case and should be preserved.
+for key, value in aria_options.items():
+if key != 'header' and not isinstance(value, str):
+aria_options[key] = str(value)
+
 is_fragmented = 'fragments' in target_format
 if not is_fragmented:
 url = target_format.get('url')
@@ -305,10 +384,20 @@ def main_download_aria(args):
 logger.error(f"Invalid --wait-timeout value: '{args.wait_timeout}'. Must be a positive integer or 'auto'.")
 return 1

+# Determine the download directory for aria2c.
+# If --remote-dir is specified, it takes precedence.
+# Otherwise, assume a local setup and use --output-dir.
+# It's crucial to use an absolute path to avoid ambiguity for the aria2c daemon.
+download_dir_for_aria = args.remote_dir
+if not download_dir_for_aria:
+local_dir = args.output_dir or '.'
+download_dir_for_aria = os.path.abspath(local_dir)
+logger.info(f"No --remote-dir specified. Using local path for aria2c download directory: {download_dir_for_aria}")
+
 if is_fragmented:
-return download_fragments_aria(args, api, target_format, filename, aria_options, timeout_seconds, remote_dir=args.remote_dir)
+return download_fragments_aria(args, api, target_format, filename, aria_options, timeout_seconds, remote_dir=download_dir_for_aria)
 else:
-return download_url_aria(args, api, url, filename, aria_options, timeout_seconds, remote_dir=args.remote_dir)
+return download_url_aria(args, api, url, filename, aria_options, timeout_seconds, remote_dir=download_dir_for_aria)

 except Exception as e:
 logger.error(f"An error occurred while communicating with aria2c: {e}", exc_info=args.verbose)
@@ -325,87 +414,98 @@ def download_url_aria(args, api, url, filename, aria_options, timeout_seconds, r
        logger.error("Failed to add download to aria2c. The API returned an empty result.")
        return 1

-    # Handle older aria2p versions that return a single Download object instead of a list
    download = downloads[0] if isinstance(downloads, list) else downloads
    logger.info(f"Successfully added download to aria2c. GID: {download.gid}")

    if args.wait:
-        logger.info(f"Waiting for download {download.gid} to complete...")
-        start_time = time.time()
+        logger.info(f"Waiting for download {download.gid} to complete using WebSocket events...")
+        download_finished_event = threading.Event()
+        final_status = {}
+
+        def on_complete(api_ref, event_gid):
+            if event_gid == download.gid:
+                logger.debug(f"WebSocket: GID {event_gid} completed.")
+                final_status['status'] = 'complete'
+                download_finished_event.set()
+
+        def on_error(api_ref, event_gid):
+            if event_gid == download.gid:
+                logger.debug(f"WebSocket: GID {event_gid} errored.")
+                final_status['status'] = 'error'
+                download_finished_event.set()
+
+        def on_stop(api_ref, event_gid):
+            if event_gid == download.gid:
+                logger.debug(f"WebSocket: GID {event_gid} stopped.")
+                final_status['status'] = 'stopped'
+                download_finished_event.set()
+
+        listener_thread = threading.Thread(
+            target=api.listen_to_notifications,
+            kwargs={
+                'on_download_complete': on_complete,
+                'on_download_error': on_error,
+                'on_download_stop': on_stop,
+                'timeout': 1,
+                'handle_signals': False
+            },
+            daemon=True
+        )
+
        try:
-            while True:
-                if timeout_seconds and (time.time() - start_time > timeout_seconds):
-                    raise TimeoutError(f"Download did not complete within {timeout_seconds}s timeout.")
-
-                # Re-fetch the download object to get the latest status
-                download.update()
-                # A download is no longer active if it's complete, errored, paused, or removed.
-                if download.status not in ('active', 'waiting'):
-                    break
-
-                progress_info = (
-                    f"\rGID {download.gid}: {download.status} "
-                    f"{download.progress_string()} "
-                    f"({download.download_speed_string()}) "
-                    f"ETA: {download.eta_string()}"
-                )
-                sys.stdout.write(progress_info)
-                sys.stdout.flush()
-                time.sleep(0.5)
-        except (KeyboardInterrupt, TimeoutError) as e:
+            listener_thread.start()
+            finished = download_finished_event.wait(timeout=timeout_seconds)
+            if not finished and not download_finished_event.is_set():
+                raise TimeoutError(f"Download did not complete within {timeout_seconds}s timeout.")
+        except KeyboardInterrupt:
            sys.stdout.write('\n')
-            if isinstance(e, KeyboardInterrupt):
-                logger.warning("Wait interrupted by user. Cleaning up download...")
-                cleanup_aria_download(api, [download])
-                return 130
-            else: # TimeoutError
-                logger.error(f"Download timed out. Cleaning up... Error: {e}")
-                cleanup_aria_download(api, [download])
-                return 1
+            logger.warning("Wait interrupted by user. Cleaning up download...")
+            cleanup_aria_download(api, [download])
+            return 130
+        except TimeoutError as e:
+            logger.error(f"Download timed out. Cleaning up... Error: {e}")
+            cleanup_aria_download(api, [download])
+            return 1
+        finally:
+            api.stop_listening()
+            if listener_thread.is_alive():
+                listener_thread.join(timeout=2)
+
+        # Re-fetch download object to get final details
+        try:
+            download.update()
        except aria2p.ClientException as e:
-            # This can happen if the download completes and is removed by aria2c
-            # before we can check its final status. Assume success in this case.
-            logger.warning(f"Could not get final status for GID {download.gid} (maybe removed on completion?): {e}. Assuming success.")
-            print(f"Download for GID {download.gid} presumed successful.")
-            return 0
-
-        sys.stdout.write('\n') # Newline after progress bar
-
-        # Final status check (no need to update again, we have the latest status)
-        if download.status == 'complete':
+            logger.warning(f"Could not update final status for GID {download.gid} (maybe removed on completion?): {e}.")
+            if final_status.get('status') != 'complete':
+                logger.error(f"Download {download.gid} failed, but could not retrieve final error details.")
+                return 1
+
+        if final_status.get('status') == 'complete':
            logger.info(f"Download {download.gid} completed successfully.")
-            downloaded_filepath_remote = None
-            if download.files:
-                downloaded_filepath_remote = download.files[0].path
+            downloaded_filepath_remote = download.files[0].path if download.files else None
            if downloaded_filepath_remote:
                print(f"Download successful: {downloaded_filepath_remote}")
            else:
                print("Download successful, but no file path reported by aria2c.")

            if args.cleanup and downloaded_filepath_remote:
-                local_filepath = None
-                # To map remote path to local, we need remote_dir and a local equivalent.
-                # We'll use fragments_dir as the local equivalent, which defaults to output_dir.
                local_base_dir = args.fragments_dir or args.output_dir or '.'
-                if remote_dir:
-                    if downloaded_filepath_remote.startswith(remote_dir):
-                        relative_path = os.path.relpath(downloaded_filepath_remote, remote_dir)
-                        local_filepath = os.path.join(local_base_dir, relative_path)
-                    else:
-                        logger.warning(f"Cleanup: Downloaded file path '{downloaded_filepath_remote}' does not start with remote-dir '{remote_dir}'. Cannot map to local path.")
+                if remote_dir and downloaded_filepath_remote.startswith(remote_dir):
+                    relative_path = os.path.relpath(downloaded_filepath_remote, remote_dir)
+                    local_filepath = os.path.join(local_base_dir, relative_path)
                else:
-                    logger.warning(f"Cleanup: --remote-dir not specified. Assuming download path is accessible locally as '{downloaded_filepath_remote}'.")
                    local_filepath = downloaded_filepath_remote
+                    if not remote_dir:
+                        logger.warning(f"Cleanup: --remote-dir not specified. Assuming download path is accessible locally as '{local_filepath}'.")

-                if local_filepath:
-                    try:
-                        if os.path.exists(local_filepath):
-                            os.remove(local_filepath)
-                            logger.info(f"Cleanup: Removed downloaded file '{local_filepath}'")
-                        else:
-                            logger.warning(f"Cleanup: File not found at expected local path '{local_filepath}'. Skipping removal.")
-                    except OSError as e:
-                        logger.error(f"Cleanup failed: Could not remove file '{local_filepath}': {e}")
+                try:
+                    if os.path.exists(local_filepath):
+                        os.remove(local_filepath)
+                        logger.info(f"Cleanup: Removed downloaded file '{local_filepath}'")
+                    else:
+                        logger.warning(f"Cleanup: File not found at expected local path '{local_filepath}'. Skipping removal.")
+                except OSError as e:
+                    logger.error(f"Cleanup failed: Could not remove file '{local_filepath}': {e}")

            elif args.cleanup:
                logger.warning("Cleanup requested, but no downloaded file path was reported by aria2c.")
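Note (illustrative, not part of the commit): a minimal standalone sketch of the event-driven wait pattern used above, built only from calls that appear in the diff plus the standard aria2p client constructor. The host, port, secret, and URL are placeholder assumptions.

# Sketch only: wait for one download via aria2c notifications instead of polling.
import threading
import aria2p

api = aria2p.API(aria2p.Client(host="http://localhost", port=6800, secret=""))  # placeholders
download = api.add_uris(["https://example.com/file.bin"])  # placeholder URL
done = threading.Event()

def on_complete(api_ref, gid):
    # Fires when aria2c reports the GID as finished.
    if gid == download.gid:
        done.set()

listener = threading.Thread(
    target=api.listen_to_notifications,
    kwargs={"on_download_complete": on_complete, "timeout": 1, "handle_signals": False},
    daemon=True,
)
listener.start()
done.wait(timeout=600)  # block until the completion event, or give up after 10 minutes
api.stop_listening()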
@@ -417,11 +517,10 @@ def download_url_aria(args, api, url, filename, aria_options, timeout_seconds, r
                    logger.warning(f"Failed to purge download history: {e}")
            elif args.remove_on_complete:
                try:
-                    api.remove_download_result(download)
+                    api.client.remove_download_result(download.gid)
                    logger.info(f"Removed download {download.gid} from aria2c history.")
                except Exception as e:
                    logger.warning(f"Failed to remove download {download.gid} from history: {e}")

            return 0
        else:
            detailed_error = parse_aria_error(download)
@@ -445,243 +544,236 @@ def download_fragments_aria(args, api, target_format, filename, aria_options, ti
        )
        return 1

-    # We need to set the 'dir' option for all fragments if specified.
-    # The 'out' option will be set per-fragment.
    frag_aria_options = aria_options.copy()
-    frag_aria_options.pop('out', None) # Remove the main 'out' option
+    frag_aria_options.pop('out', None)

    if remote_dir:
        frag_aria_options['dir'] = remote_dir
        logger.info(f"Instructing remote aria2c to save fragments to: {remote_dir}")

    base_filename, file_ext = os.path.splitext(filename)
-    calls = []
+    logger.info(f"Preparing {len(fragments)} fragments for a batch submission to aria2c...")
+    multicall_payload = []
    for i, fragment in enumerate(fragments):
-        frag_url = fragment.get('url')
+        frag_url = fragment.get('url') or urljoin(fragment_base_url, fragment['path'])
        if not frag_url:
-            if not fragment_base_url:
-                logger.error(f"Fragment {i} has no URL and no fragment_base_url is available. Aborting.")
-                return 1
-            frag_url = urljoin(fragment_base_url, fragment['path'])
+            logger.error(f"Fragment {i} has no URL and no fragment_base_url is available. Aborting.")
+            return 1

-        # Use the base filename from the main file, but add fragment identifier
        fragment_filename = f"{base_filename}-Frag{i}{file_ext}"

        current_frag_options = frag_aria_options.copy()
        current_frag_options['out'] = os.path.basename(fragment_filename)

-        # Prepare parameters for multicall in the format:
-        # {"methodName": "aria2.addUri", "params": [["url"], {"out": "file.mp4"}]}
-        # The secret token is automatically added by aria2p.
+        # The aria2p library will handle adding the secret token to each call in the multicall.
        params = [[frag_url], current_frag_options]
-        call_struct = {
-            "methodName": api.client.ADD_URI,
-            "params": params
-        }
-        calls.append(call_struct)
+        multicall_payload.append({'methodName': 'aria2.addUri', 'params': params})

-    results = api.client.multicall(calls)
-    if not results:
-        logger.error("Failed to add fragments to aria2c. The API returned an empty result.")
-        return 1
-
-    # The result of a multicall of addUri is a list of lists, where each inner list
-    # contains the GID of one download, e.g., [['gid1'], ['gid2']].
-    # A failed call for a fragment may result in a fault struct dict instead of a list.
-    # We extract GIDs from successful calls.
-    gids = [result[0] for result in results if isinstance(result, list) and result]
-
-    if len(gids) != len(fragments):
-        failed_count = len(fragments) - len(gids)
-        logger.warning(f"{failed_count} out of {len(fragments)} fragments failed to be added to aria2c.")
-
-    if not gids:
-        logger.error("Failed to add any fragments to aria2c. All submissions failed.")
-        return 1
-
-    logger.info(f"Successfully added {len(gids)} fragments to aria2c.")
-    if args.verbose:
-        logger.debug(f"GIDs: {gids}")
-
-    if args.wait:
-        logger.info(f"Waiting for {len(gids)} fragments to complete...")
-        start_time = time.time()
-        downloads_to_cleanup = []
-        try:
-            while True:
-                if timeout_seconds and (time.time() - start_time > timeout_seconds):
-                    raise TimeoutError(f"Fragment downloads did not complete within {timeout_seconds}s timeout.")
-
-                downloads = api.get_downloads(gids)
-                downloads_to_cleanup = downloads # Store for potential cleanup
-                # A download is considered "active" if it's currently downloading or waiting in the queue.
-                # It is "not active" if it is complete, errored, paused, or removed.
-                active_downloads = [d for d in downloads if d.status in ('active', 'waiting')]
-                if not active_downloads:
-                    break # All downloads are complete or have stopped for other reasons
-
-                for d in active_downloads:
-                    d.update()
-
-                completed_count = len(downloads) - len(active_downloads)
-                total_bytes = sum(d.total_length for d in downloads)
-                downloaded_bytes = sum(d.completed_length for d in downloads)
-                total_speed = sum(d.download_speed for d in downloads)
-                progress_percent = (downloaded_bytes / total_bytes * 100) if total_bytes > 0 else 0
-
-                progress_info = (
-                    f"\rProgress: {completed_count}/{len(downloads)} fragments | "
-                    f"{progress_percent:.1f}% "
-                    f"({human_readable_bytes(downloaded_bytes)}/{human_readable_bytes(total_bytes)}) "
-                    f"Speed: {human_readable_bytes(total_speed)}/s"
-                )
-                sys.stdout.write(progress_info)
-                sys.stdout.flush()
-                time.sleep(0.5)
-        except (KeyboardInterrupt, TimeoutError) as e:
-            sys.stdout.write('\n')
-            if isinstance(e, KeyboardInterrupt):
-                logger.warning("Wait interrupted by user. Cleaning up fragments...")
-                cleanup_aria_download(api, downloads_to_cleanup)
-                return 130
-            else: # TimeoutError
-                logger.error(f"Download timed out. Cleaning up fragments... Error: {e}")
-                cleanup_aria_download(api, downloads_to_cleanup)
-                return 1
-        except aria2p.ClientException as e:
-            # This can happen if downloads complete and are removed by aria2c
-            # before we can check their final status. Assume success in this case.
-            logger.warning(f"Could not get final status for some fragments (maybe removed on completion?): {e}. Assuming success.")
-
-        # Final status check
-        failed_downloads = []
-        try:
-            downloads = api.get_downloads(gids)
-            failed_downloads = [d for d in downloads if d.status != 'complete']
-        except aria2p.ClientException as e:
-            logger.warning(f"Could not perform final status check for fragments (maybe removed on completion?): {e}. Assuming success.")
-            # If we can't check, we assume success based on the earlier wait loop not failing catastrophically.
-            failed_downloads = []
-
-        if failed_downloads:
-            logger.error(f"{len(failed_downloads)} fragments failed to download.")
-            for d in failed_downloads:
-                detailed_error = parse_aria_error(d)
-                logger.error(f" GID {d.gid}: {detailed_error}")
-            return 1
-        else:
-            logger.info("All fragments downloaded successfully.")
-            output_dir = args.output_dir or '.'
-            final_filepath = os.path.join(output_dir, filename)
-            fragments_lookup_dir = args.fragments_dir or output_dir
-
-            if args.auto_merge_fragments:
-                logger.info(f"Attempting to merge fragments into: {final_filepath}")
-                logger.info(f"Searching for fragments in local directory: {os.path.abspath(fragments_lookup_dir)}")
-                try:
-                    # base_filename and file_ext are available from earlier in the function
-                    # We must escape the base filename in case it contains glob special characters like [ or ].
-                    escaped_base = glob.escape(base_filename)
-                    search_path = os.path.join(fragments_lookup_dir, f"{escaped_base}-Frag*{file_ext}")
-                    fragment_files = glob.glob(search_path)
-
-                    if not fragment_files:
-                        logger.error(f"No fragment files found with pattern: {search_path}")
-                        return 1
-
-                    def fragment_sort_key(f):
-                        match = re.search(r'Frag(\d+)', os.path.basename(f))
-                        return int(match.group(1)) if match else -1
-                    fragment_files.sort(key=fragment_sort_key)
-
-                    with open(final_filepath, 'wb') as dest_file:
-                        for frag_path in fragment_files:
-                            with open(frag_path, 'rb') as src_file:
-                                shutil.copyfileobj(src_file, dest_file)
-
-                    logger.info(f"Successfully merged {len(fragment_files)} fragments into {final_filepath}")
-
-                    if args.remove_fragments_after_merge or args.cleanup:
-                        logger.info("Removing fragment files...")
-                        for frag_path in fragment_files:
-                            os.remove(frag_path)
-                        logger.info("Fragment files removed.")
-
-                    if args.cleanup:
-                        try:
-                            os.remove(final_filepath)
-                            logger.info(f"Cleanup: Removed merged file '{final_filepath}'")
-                        except OSError as e:
-                            logger.error(f"Cleanup failed: Could not remove merged file '{final_filepath}': {e}")
-
-                    print(f"Download and merge successful: {final_filepath}")
-
-                    if args.purge_on_complete:
-                        try:
-                            api.purge_download_result()
-                            logger.info("Purged all completed/failed downloads from aria2c history.")
-                        except Exception as e:
-                            logger.warning(f"Failed to purge download history: {e}")
-                    elif args.remove_on_complete:
-                        try:
-                            # The `downloads` variable from the last status check should be valid here.
-                            api.remove_download_result(downloads)
-                            logger.info(f"Removed {len(downloads)} fragment downloads from aria2c history.")
-                        except aria2p.ClientException as e:
-                            logger.warning(f"Could not remove fragment downloads from history (maybe already gone?): {e}")
-                        except Exception as e:
-                            logger.warning(f"Failed to remove fragment downloads from history: {e}")
-
-                    return 0
-                except Exception as e:
-                    logger.error(f"An error occurred during merging: {e}", exc_info=args.verbose)
-                    logger.error("Fragments were downloaded but not merged.")
-                    return 1
-            else:
-                print("Download successful. Fragments now need to be merged manually.")
-                print(f"The final merged file should be named: {final_filepath}")
-                print("You can merge them with a command like:")
-                print(f" cat `ls -v '{os.path.join(fragments_lookup_dir, base_filename)}'-Frag*'{file_ext}'` > '{final_filepath}'")
-
-                if args.cleanup:
-                    logger.info("Cleanup requested. Removing downloaded fragments...")
-                    try:
-                        # base_filename and file_ext are available from earlier in the function
-                        escaped_base = glob.escape(base_filename)
-                        search_path = os.path.join(fragments_lookup_dir, f"{escaped_base}-Frag*{file_ext}")
-                        fragment_files = glob.glob(search_path)
-
-                        if not fragment_files:
-                            logger.warning(f"Cleanup: No fragment files found with pattern: {search_path}")
-                        else:
-                            for frag_path in fragment_files:
-                                os.remove(frag_path)
-                            logger.info(f"Removed {len(fragment_files)} fragment files.")
-                    except Exception as e:
-                        logger.error(f"An error occurred during fragment cleanup: {e}", exc_info=args.verbose)
-
-                if args.purge_on_complete:
-                    try:
-                        api.purge_download_result()
-                        logger.info("Purged all completed/failed downloads from aria2c history.")
-                    except Exception as e:
-                        logger.warning(f"Failed to purge download history: {e}")
-                elif args.remove_on_complete:
-                    try:
-                        # The `downloads` variable from the last status check should be valid here.
-                        api.remove_download_result(downloads)
-                        logger.info(f"Removed {len(downloads)} fragment downloads from aria2c history.")
-                    except aria2p.ClientException as e:
-                        logger.warning(f"Could not remove fragment downloads from history (maybe already gone?): {e}")
-                    except Exception as e:
-                        logger.warning(f"Failed to remove fragment downloads from history: {e}")
-    else:
-        print(f"Successfully added {len(gids)} fragments. GIDs: {gids}")
-        print("These fragments will need to be merged manually after download.")
+    if not args.wait:
+        # Asynchronous mode: submit all fragments at once and exit.
+        gids, failed_count = [], 0
+        try:
+            logger.info(f"Submitting {len(multicall_payload)} fragments to aria2c in a single batch request...")
+            # The aria2p client library correctly handles authentication for multicalls.
+            results = api.client.multicall(multicall_payload)
+            for i, result in enumerate(results):
+                if isinstance(result, list) and len(result) == 1 and isinstance(result[0], str):
+                    gids.append(result[0])
+                else:
+                    failed_count += 1
+                    logger.warning(f"Failed to add fragment {i + 1}: {result[0] if isinstance(result, list) else result}")
+        except Exception as e:
+            logger.error(f"Batch submission to aria2c failed: {e}", exc_info=args.verbose)
+            return 1
+        if failed_count > 0:
+            logger.warning(f"{failed_count} out of {len(fragments)} fragments failed to be added to aria2c.")
+        if not gids:
+            logger.error("Failed to add any fragments to aria2c. All submissions failed.")
+            return 1
+        print(f"Successfully added {len(gids)} fragments. GIDs: {gids}\nThese fragments will need to be merged manually after download.")
+        return 0
+
+    # Synchronous (--wait) mode with WebSockets
+    MAX_CONCURRENT_FRAGMENTS = args.max_concurrent_fragments
+    all_gids, failed_submission_count = [], 0
+    submitted_gids, completed_gids = set(), set()
+    lock = threading.Lock()
+    pending_fragments = list(enumerate(multicall_payload))
+    total_fragment_count = len(pending_fragments)
+
+    logger.info(f"Waiting for {total_fragment_count} fragments to complete using WebSocket events...")
+    logger.info(f"Will maintain up to {MAX_CONCURRENT_FRAGMENTS} active fragment downloads.")
+
+    def on_event(api_ref, event_gid):
+        with lock:
+            if event_gid in submitted_gids:
+                completed_gids.add(event_gid)
+
+    listener_thread = threading.Thread(
+        target=api.listen_to_notifications,
+        kwargs={'on_download_complete': on_event, 'on_download_error': on_event, 'on_download_stop': on_event, 'timeout': 1, 'handle_signals': False},
+        daemon=True
+    )
+    listener_thread.start()
+    start_time = time.time()
+
+    try:
+        while True:
+            with lock:
+                if len(completed_gids) >= total_fragment_count:
+                    break
+            if timeout_seconds and (time.time() - start_time > timeout_seconds):
+                raise TimeoutError(f"Fragment downloads did not complete within {timeout_seconds}s timeout.")
+
+            with lock:
+                active_gids_count = len(submitted_gids) - len(completed_gids)
+                num_to_submit = MAX_CONCURRENT_FRAGMENTS - active_gids_count
+
+            if num_to_submit > 0 and pending_fragments:
+                chunk_to_submit = pending_fragments[:num_to_submit]
+                pending_fragments = pending_fragments[num_to_submit:]
+                indices = [item[0] for item in chunk_to_submit]
+                payloads = [item[1] for item in chunk_to_submit]
+                try:
+                    # The aria2p client library correctly handles authentication for multicalls.
+                    results = api.client.multicall(payloads)
+                    with lock:
+                        for i, result in enumerate(results):
+                            original_index = indices[i]
+                            if isinstance(result, list) and len(result) == 1 and isinstance(result[0], str):
+                                gid = result[0]
+                                all_gids.append(gid)
+                                submitted_gids.add(gid)
+                            else:
+                                failed_submission_count += 1
+                                completed_gids.add(f"failed-submission-{original_index}")
+                                logger.warning(f"Failed to add fragment {original_index + 1}: {result[0] if isinstance(result, list) else result}")
+                except Exception as e:
+                    logger.error(f"Batch submission to aria2c failed for a chunk: {e}", exc_info=args.verbose)
+                    with lock:
+                        for i in indices:
+                            failed_submission_count += 1
+                            completed_gids.add(f"failed-submission-{i}")
+
+            with lock:
+                completed_download_count = len(completed_gids)
+            progress_percent = (completed_download_count / total_fragment_count * 100) if total_fragment_count > 0 else 0
+            sys.stdout.write(f"\rProgress: {completed_download_count}/{total_fragment_count} fragments | {progress_percent:.1f}%")
+            sys.stdout.flush()
+            time.sleep(0.5)
+    except (KeyboardInterrupt, TimeoutError) as e:
+        sys.stdout.write('\n')
+        if isinstance(e, KeyboardInterrupt):
+            logger.warning("Wait interrupted by user. Cleaning up fragments...")
+        else:
+            logger.error(f"Download timed out. Cleaning up fragments... Error: {e}")
+        cleanup_aria_download(api, api.get_downloads(list(submitted_gids)))
+        return 130 if isinstance(e, KeyboardInterrupt) else 1
+    finally:
+        api.stop_listening()
+        if listener_thread.is_alive():
+            listener_thread.join(timeout=2)
+
+    sys.stdout.write('\n')
+    if failed_submission_count > 0:
+        logger.error(f"{failed_submission_count} fragments failed to be submitted to aria2c.")
+
+    final_downloads = []
+    if all_gids:
+        try:
+            final_downloads = api.get_downloads(all_gids)
+        except aria2p.ClientException as e:
+            logger.warning(f"Could not perform final status check for fragments (maybe removed on completion?): {e}. Assuming success.")
+
+    failed_downloads = [d for d in final_downloads if d.status != 'complete']
+    if failed_downloads:
+        logger.error(f"{len(failed_downloads)} fragments failed to download.")
+        for d in failed_downloads[:5]:
+            logger.error(f" GID {d.gid}: {parse_aria_error(d)}")
+        if len(failed_downloads) > 5:
+            logger.error(f" ... and {len(failed_downloads) - 5} more errors.")
+        return 1
+    if failed_submission_count > 0:
+        logger.error("Aborting due to fragment submission failures.")
+        return 1
+
+    logger.info("All fragments downloaded successfully.")
+    output_dir = args.output_dir or '.'
+    final_filepath = os.path.join(output_dir, filename)
+    fragments_lookup_dir = args.fragments_dir or output_dir
+
+    if args.auto_merge_fragments:
+        logger.info(f"Attempting to merge fragments into: {final_filepath}")
+        logger.info(f"Searching for fragments in local directory: {os.path.abspath(fragments_lookup_dir)}")
+        try:
+            escaped_base = glob.escape(base_filename)
+            search_path = os.path.join(fragments_lookup_dir, f"{escaped_base}-Frag*{file_ext}")
+            fragment_files = sorted(glob.glob(search_path), key=lambda f: int(re.search(r'Frag(\d+)', os.path.basename(f)).group(1)))
+            if not fragment_files:
+                logger.error(f"No fragment files found with pattern: {search_path}")
+                return 1
+
+            with open(final_filepath, 'wb') as dest_file:
+                for frag_path in fragment_files:
+                    with open(frag_path, 'rb') as src_file:
+                        shutil.copyfileobj(src_file, dest_file)
+            logger.info(f"Successfully merged {len(fragment_files)} fragments into {final_filepath}")
+
+            if args.remove_fragments_after_merge or args.cleanup:
+                logger.info("Removing fragment files...")
+                for frag_path in fragment_files: os.remove(frag_path)
+                logger.info("Fragment files removed.")
+            if args.cleanup:
+                try:
+                    os.remove(final_filepath)
+                    logger.info(f"Cleanup: Removed merged file '{final_filepath}'")
+                except OSError as e:
+                    logger.error(f"Cleanup failed: Could not remove merged file '{final_filepath}': {e}")
+
+            print(f"Download and merge successful: {final_filepath}")
+
+            if args.purge_on_complete:
+                try:
+                    api.purge_download_result()
+                    logger.info("Purged all completed/failed downloads from aria2c history.")
+                except Exception as e:
+                    logger.warning(f"Failed to purge download history: {e}")
+            elif args.remove_on_complete:
+                try:
+                    for d in final_downloads:
+                        try: api.client.remove_download_result(d.gid)
+                        except aria2p.ClientException: pass
+                    logger.info(f"Removed {len(final_downloads)} fragment downloads from aria2c history.")
+                except Exception as e:
+                    logger.warning(f"Failed to remove fragment downloads from history: {e}")
+            return 0
+        except Exception as e:
+            logger.error(f"An error occurred during merging: {e}", exc_info=args.verbose)
+            logger.error("Fragments were downloaded but not merged.")
+            return 1
+    else:
+        print(f"Download successful. Fragments now need to be merged manually.\nThe final merged file should be named: {final_filepath}")
+        print(f"You can merge them with a command like:\n cat `ls -v '{os.path.join(fragments_lookup_dir, base_filename)}'-Frag*'{file_ext}'` > '{final_filepath}'")
+        if args.cleanup:
+            logger.info("Cleanup requested. Removing downloaded fragments...")
+            try:
+                escaped_base = glob.escape(base_filename)
+                search_path = os.path.join(fragments_lookup_dir, f"{escaped_base}-Frag*{file_ext}")
+                fragment_files = glob.glob(search_path)
+                if not fragment_files:
+                    logger.warning(f"Cleanup: No fragment files found with pattern: {search_path}")
+                else:
+                    for frag_path in fragment_files: os.remove(frag_path)
+                    logger.info(f"Removed {len(fragment_files)} fragment files.")
+            except Exception as e:
+                logger.error(f"An error occurred during fragment cleanup: {e}", exc_info=args.verbose)
+        if args.purge_on_complete:
+            try:
+                api.purge_download_result()
+                logger.info("Purged all completed/failed downloads from aria2c history.")
+            except Exception as e:
+                logger.warning(f"Failed to purge download history: {e}")
+        elif args.remove_on_complete:
+            try:
+                api.remove_download_result(final_downloads)
+                logger.info(f"Removed {len(final_downloads)} fragment downloads from aria2c history.")
+            except Exception as e:
+                logger.warning(f"Could not remove fragment downloads from history (maybe already gone?): {e}")
    return 0
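Note (illustrative, not part of the commit): the batch submission above relies on aria2c's multicall of `aria2.addUri` entries. One entry of that payload has the shape documented in the comments above; the URL, output name, and directory below are placeholders, and aria2p adds the RPC secret itself.

# Shape of one multicall entry, as described in the comments in the hunk above.
entry = {
    "methodName": "aria2.addUri",
    "params": [
        ["https://example.com/seg-000.ts"],                 # list of URIs for this fragment
        {"out": "video-Frag0.mp4", "dir": "/downloads"},    # per-fragment aria2c options
    ],
}
multicall_payload = [entry]  # one such dict per fragment, submitted in a single RPC request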
@@ -84,12 +84,19 @@ def add_download_native_py_parser(subparsers):
    parser.add_argument('--output-buffer', action='store_true', help='Download to an in-memory buffer and print raw bytes to stdout. Final filename is printed to stderr.')
    parser.add_argument('--cleanup', action='store_true', help='After download, rename the file to include a timestamp and truncate it to 0 bytes.')
    parser.add_argument('--merge-output-format', help='Container format to merge to (e.g., "mp4", "mkv"). Overrides config file.')
+    parser.add_argument('--retries', type=int, help='Number of retries for the entire download (default: 10).')
+    parser.add_argument('--fragment-retries', type=int, help='Number of retries for each fragment (default: 10).')
+    parser.add_argument('--socket-timeout', type=int, help='Timeout for socket operations in seconds (default: 20).')
+    parser.add_argument('--add-header', action='append', help='Add a custom HTTP header for the download. Format: "Key: Value". Can be used multiple times.')
+    # Arguments to pass through to yt-dlp
+    parser.add_argument('--download-sections', help='yt-dlp --download-sections argument (e.g., "*0-10240").')
+    parser.add_argument('--test', action='store_true', help='yt-dlp --test argument (download small part).')
    return parser


def main_download_native_py(args):
    """Main logic for the 'download-native-py' command."""
-    # If outputting to buffer, all logging must go to stderr to keep stdout clean for binary data.
-    log_stream = sys.stderr if args.output_buffer else sys.stdout
+    # All logging should go to stderr to keep stdout clean for the final filename, or for binary data with --output-buffer.
+    log_stream = sys.stderr
    log_level = logging.DEBUG if args.verbose else logging.INFO
    # Reconfigure root logger
    for handler in logging.root.handlers[:]:

@@ -176,7 +183,10 @@ def main_download_native_py(args):
        logger.info(f"Adding {len(extra_args_list)} extra arguments from --extra-ytdlp-args.")
        base_opts_args.extend(extra_args_list)

-    ydl_opts = {}
+    ydl_opts = {
+        'noresizebuffer': True,
+        'buffersize': '4M',
+    }
    if base_opts_args:
        try:
            logger.info(f"Parsing {len(base_opts_args)} arguments from config/extra_args...")

@@ -192,6 +202,17 @@ def main_download_native_py(args):

                # Handle flags (no value)
                is_flag = i + 1 >= len(base_opts_args) or base_opts_args[i + 1].startswith('--')
+
+                if key == 'resize_buffer':
+                    ydl_opts['noresizebuffer'] = False
+                    logger.debug(f"Parsed flag: noresizebuffer = False")
+                    i += 1
+                    continue
+                elif key == 'no_resize_buffer':
+                    ydl_opts['noresizebuffer'] = True
+                    logger.debug(f"Parsed flag: noresizebuffer = True")
+                    i += 1
+                    continue

                if is_flag:
                    if key.startswith('no_'):

@@ -229,6 +250,8 @@ def main_download_native_py(args):
                # Special handling for keys that differ from CLI arg, e.g. --limit-rate -> ratelimit
                if key == 'limit_rate':
                    key = 'ratelimit'
+                elif key == 'buffer_size':
+                    key = 'buffersize'

                ydl_opts[key] = value
                logger.debug(f"Parsed option: {key} = {value}")

@@ -257,6 +280,21 @@ def main_download_native_py(args):
        ydl_opts['paths'] = {'temp': args.temp_path}
        logger.info(f"Using temporary path: {args.temp_path}")

+    if args.add_header:
+        if 'http_headers' not in ydl_opts:
+            ydl_opts['http_headers'] = {}
+        elif not isinstance(ydl_opts['http_headers'], dict):
+            logger.warning(f"Overwriting non-dictionary http_headers from config with headers from command line.")
+            ydl_opts['http_headers'] = {}
+
+        for header in args.add_header:
+            if ':' not in header:
+                logger.error(f"Invalid header format in --add-header: '{header}'. Expected 'Key: Value'.")
+                return 1
+            key, value = header.split(':', 1)
+            ydl_opts['http_headers'][key.strip()] = value.strip()
+            logger.info(f"Adding/overwriting header: {key.strip()}: {value.strip()}")
+
    if args.download_continue:
        ydl_opts['continuedl'] = True
        ydl_opts['nooverwrites'] = True
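Note (illustrative, not part of the commit): a minimal sketch of where the parsed --add-header values end up when driving yt-dlp through its Python API, which is what the hunk above does. The header values, output template, and URL are placeholders.

# Sketch only: custom headers are passed to yt-dlp as a plain dict under 'http_headers'.
import yt_dlp

ydl_opts = {
    "http_headers": {"X-Example": "value", "Accept-Language": "fr"},
    "outtmpl": "%(title)s.%(ext)s",
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(["https://example.com/watch?v=placeholder"])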
@@ -279,6 +317,19 @@ def main_download_native_py(args):
    if args.merge_output_format:
        ydl_opts['merge_output_format'] = args.merge_output_format
+
+    if args.download_sections:
+        ydl_opts['download_sections'] = args.download_sections
+
+    if args.test:
+        ydl_opts['test'] = True
+
+    if args.retries is not None:
+        ydl_opts['retries'] = args.retries
+    if args.fragment_retries is not None:
+        ydl_opts['fragment_retries'] = args.fragment_retries
+    if args.socket_timeout is not None:
+        ydl_opts['socket_timeout'] = args.socket_timeout

    try:
        logger.info(f"Starting download for format '{args.format}' using yt-dlp library...")

@@ -301,6 +352,13 @@ def main_download_native_py(args):

        # The success path is now always taken if no exception was raised.
        if retcode == 0:
+            if ytdlp_logger.is_403:
+                logger.error("Download failed: yt-dlp reported HTTP Error 403: Forbidden. The URL has likely expired.")
+                return 1
+            if ytdlp_logger.is_timeout:
+                logger.error("Download failed: yt-dlp reported a timeout.")
+                return 1
+
            logger.info("yt-dlp download completed successfully.")

            if args.output_buffer:

@@ -44,6 +44,14 @@ def add_download_parser(subparsers):
    parser.add_argument('--downloader', help='Name of the external downloader to use (e.g., "aria2c", "native").')
    parser.add_argument('--downloader-args', help='Arguments to pass to the external downloader (e.g., "aria2c:-x 8").')
    parser.add_argument('--merge-output-format', help='Container format to merge to (e.g., "mp4", "mkv"). Overrides config file.')
+    parser.add_argument('--retries', help='Number of retries for the entire download (default: 10).')
+    parser.add_argument('--fragment-retries', help='Number of retries for each fragment (default: 10).')
+    parser.add_argument('--socket-timeout', help='Timeout for socket operations in seconds (default: 20).')
+    parser.add_argument('--lang', help='Language code for the request (e.g., "fr", "ja"). Affects metadata language.')
+    parser.add_argument('--timezone', help='Timezone for the request (e.g., "UTC", "America/New_York"). Note: not supported by yt-dlp.')
+    # Arguments to pass through to yt-dlp
+    parser.add_argument('--download-sections', help='yt-dlp --download-sections argument (e.g., "*0-10240").')
+    parser.add_argument('--test', action='store_true', help='yt-dlp --test argument (download small part).')
    return parser


def main_download(args):

@@ -151,6 +159,19 @@ def main_download(args):
    if args.merge_output_format:
        cmd.extend(['--merge-output-format', args.merge_output_format])
+
+    if args.download_sections:
+        cmd.extend(['--download-sections', args.download_sections])
+
+    if args.test:
+        cmd.append('--test')
+
+    if args.retries:
+        cmd.extend(['--retries', str(args.retries)])
+    if args.fragment_retries:
+        cmd.extend(['--fragment-retries', str(args.fragment_retries)])
+    if args.socket_timeout:
+        cmd.extend(['--socket-timeout', str(args.socket_timeout)])

    if args.download_continue:
        cmd.extend(['--continue', '--part'])

@@ -172,6 +193,12 @@ def main_download(args):
    if proxy_url:
        cmd.extend(['--proxy', proxy_url])
+
+    if args.lang:
+        cmd.extend(['--extractor-args', f'youtube:lang={args.lang}'])
+
+    if args.timezone:
+        logger.warning(f"Timezone override ('{args.timezone}') is not supported by yt-dlp and will be ignored.")

    # Determine if we need to capture output.
    capture_output = args.cleanup or args.log_file or args.print_traffic
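Note (illustrative, not part of the commit): a sketch of the yt-dlp command line the --lang pass-through above produces. The executable name and URL are placeholders.

# Sketch only: the language option is forwarded as a yt-dlp extractor argument.
import subprocess

lang = "fr"
cmd = ["yt-dlp", "--extractor-args", f"youtube:lang={lang}", "https://example.com/watch?v=placeholder"]
subprocess.run(cmd, check=False)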
@@ -208,6 +235,16 @@ def main_download(args):
        stdout_data, stderr_data = process.communicate()
        return_code = process.returncode
+
+        # Post-run check for silent failures, like 403 errors where yt-dlp might still exit 0.
+        if return_code == 0:
+            output_text = (stdout_data or "") + (stderr_data or "")
+            if "HTTP Error 403" in output_text:
+                logger.error("yt-dlp exited successfully, but a 403 error was detected in its output. Forcing failure.")
+                return_code = 1  # Override success code
+            elif "timed out" in output_text.lower() or "timeout" in output_text.lower():
+                logger.error("yt-dlp exited successfully, but a timeout was detected in its output. Forcing failure.")
+                return_code = 1

        # Write captured output to terminal and log file
        if stdout_data:
            sys.stdout.write(stdout_data)

@@ -124,7 +124,9 @@ the browser-based generation strategy.''')
    parser.add_argument('--direct', action='store_true', help='Use the direct yt-dlp info.json generation method, bypassing Node.js token generation.')
    parser.add_argument('--print-info-out', action='store_true', help='Print the final info.json to stdout. By default, output is suppressed unless writing to a file.')
    parser.add_argument('--request-params-json', help=REQUEST_PARAMS_HELP_STRING + '\nCan also be a comma-separated string of key=value pairs (e.g., "caching_policy.mode=force_refresh").')
-    parser.add_argument('--force-renew', help='Comma-separated list of items to force-renew: cookies, visitor_id, po_token, nsig_cache, all.')
+    parser.add_argument('--force-renew', help='Comma-separated list of items to force-renew: cookies, visitor_id, po_token, nsig_cache, info_json, all.')
+    parser.add_argument('--lang', help='Language code for the request (e.g., "fr", "ja"). Affects metadata language.')
+    parser.add_argument('--timezone', help='Timezone for the request (e.g., "UTC", "America/New_York"). Note: experimental, may not be fully supported.')
    return parser


def main_get_info(args):

@@ -188,6 +190,16 @@ def main_get_info(args):
        items_to_renew = [item.strip() for item in args.force_renew.split(',')]
        request_params['force_renew'] = items_to_renew
        logger.info(f"Requesting force renew for: {items_to_renew}")
+
+    if args.lang:
+        session_params = request_params.setdefault('session_params', {})
+        session_params['lang'] = args.lang
+        logger.info(f"Requesting language: {args.lang}")
+
+    if args.timezone:
+        session_params = request_params.setdefault('session_params', {})
+        session_params['timeZone'] = args.timezone
+        logger.info(f"Requesting timezone: {args.timezone}")

    if args.verbose:
        # Add verbose flag for yt-dlp on the server
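Note (illustrative, not part of the commit): the shape of the request parameters the new --lang/--timezone flags add; the key names mirror the code above, and the values here are examples.

# Illustration of what the hunk above adds to request_params for "--lang fr --timezone UTC".
import json

request_params = {}
request_params.setdefault('session_params', {})['lang'] = 'fr'
request_params.setdefault('session_params', {})['timeZone'] = 'UTC'
print(json.dumps(request_params))  # {"session_params": {"lang": "fr", "timeZone": "UTC"}}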
@@ -244,6 +256,15 @@ def main_get_info(args):

    if not token_data or not hasattr(token_data, 'infoJson') or not token_data.infoJson:
        logger.error("Server did not return valid info.json data.")
+        if args.verbose:
+            logger.debug(f"Received token_data from server: {token_data!r}")
+            if not token_data:
+                logger.error("Reason: The entire token_data object received from the server is null.")
+            elif not hasattr(token_data, 'infoJson'):
+                logger.error("Reason: The received token_data object does not have an 'infoJson' attribute.")
+            elif not token_data.infoJson:
+                logger.error("Reason: The 'infoJson' attribute in the received token_data object is empty or null.")
+
        print("Error: Server did not return valid info.json data.", file=sys.stderr)
        return 1

@@ -9,6 +9,11 @@ import re
from urllib.parse import urlparse, parse_qs
from datetime import datetime, timezone
+
+try:
+    import yt_dlp
+except ImportError:
+    yt_dlp = None

def format_size(b):
    """Format size in bytes to human-readable string."""
    if b is None:

@@ -32,9 +37,39 @@ def list_formats(info_json, requested_formats_str=None, file=sys.stdout):
    requested_formats = []
    requested_order = {}
    if requested_formats_str:
-        # Split by comma or slash, and filter out empty strings
-        requested_formats = [item for item in re.split(r'[,/]', requested_formats_str) if item]
-        requested_order = {fmt: i for i, fmt in enumerate(requested_formats)}
+        if yt_dlp:
+            try:
+                ydl = yt_dlp.YoutubeDL({'quiet': True})
+                formats = info_json.get('formats', [])
+                selector = ydl.build_format_selector(requested_formats_str)
+                ctx = {
+                    'formats': formats,
+                    'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
+                    'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats)
+                                           or all(f.get('acodec') == 'none' for f in formats)),
+                }
+                selected_formats = list(selector(ctx))
+
+                all_selected_ids = []
+                for f in selected_formats:
+                    if 'requested_formats' in f:
+                        all_selected_ids.extend(rf['format_id'] for rf in f['requested_formats'])
+                    else:
+                        all_selected_ids.append(f['format_id'])
+
+                requested_formats = all_selected_ids
+                requested_order = {fmt: i for i, fmt in enumerate(requested_formats)}
+
+            except Exception as e:
+                print(f"WARNING: Could not parse format selector '{requested_formats_str}': {e}", file=sys.stderr)
+                # Fallback to simple parsing
+                requested_formats = [item for item in re.split(r'[,/]', requested_formats_str) if item]
+                requested_order = {fmt: i for i, fmt in enumerate(requested_formats)}
+        else:
+            # Fallback to simple parsing if yt-dlp is not installed
+            print("WARNING: yt-dlp not installed. Using simple format selector parsing.", file=sys.stderr)
+            requested_formats = [item for item in re.split(r'[,/]', requested_formats_str) if item]
+            requested_order = {fmt: i for i, fmt in enumerate(requested_formats)}

    def sort_key(f):
        fid = f.get('format_id', '')
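Note (illustrative, not part of the commit): a hypothetical usage of the selector-aware list_formats above. The module name and info.json file name are assumptions; only the function signature shown in the hunk is taken from the code.

# Sketch only: feed an existing info.json through list_formats with a yt-dlp style selector.
import json
from format_utils import list_formats  # module name is an assumption

with open("video.info.json", encoding="utf-8") as f:
    info = json.load(f)

list_formats(info, requested_formats_str="bestvideo*+bestaudio/best")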
@@ -20,6 +20,7 @@ Example of a full configuration JSON showing default values (use single quotes t
  "use_curl_prefetch": false,
  "skip_cache": false,
  "visitor_id_override_enabled": true,
+  "webpo_bind_to_visitor_id": true,
  "extractor_args": {
    "youtubepot-bgutilhttp": {
      "base_url": "http://172.17.0.1:4416"
@@ -28,21 +29,22 @@ Example of a full configuration JSON showing default values (use single quotes t
      "pot_trace": "true",
      "formats": "duplicate",
      "player_js_version": "actual"
-    },
-    "youtubepot-webpo": {
-      "bind_to_visitor_id": "true"
    }
  }
},
"_comment_ytdlp_params": "Parameters passed directly to the yt-dlp wrapper for info.json generation.",
+"_comment_webpo_bind_to_visitor_id": "If true (default), binds the PO Token cache to the visitor ID. Set to false for TV clients if caching issues occur, as this is not recommended for them.",
"_comment_visitor_id_override_enabled": "If true (default), the server validates the visitor ID from the token generator and creates a new one if it is invalid. Set to false to force using the provided visitor ID without validation, which is useful for debugging.",
"_comment_extractor_args": "Directly override yt-dlp extractor arguments. To use BGUtils in script mode, replace 'youtubepot-bgutilhttp' with 'youtubepot-bgutilscript'. The script path is '/opt/bgutil-ytdlp-pot-provider-server/build/generate_once.js'. To disable any explicit provider (like '--bgutils-mode none' on the server), remove both 'youtubepot-bgutilhttp' and 'youtubepot-bgutilscript' keys.",

"session_params": {
  "lang": "en-US",
+  "timeZone": "UTC",
  "location": "US",
  "deviceCategory": "MOBILE",
-  "user_agent": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)"
+  "user_agent": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)",
+  "visitor_rotation_threshold": 250
},
-"_comment_session_params": "Parameters for the token generation session (primarily for Node.js)."
+"_comment_session_params": "Parameters for the token generation session. `visitor_rotation_threshold` overrides the server's default request limit before a profile's visitor ID is rotated. Set to 0 to disable rotation.",
+"_comment_lang_and_tz": "`lang` sets the 'hl' parameter for YouTube's API, affecting metadata language. `timeZone` is intended to set the timezone for requests, but is not fully supported by yt-dlp yet."
}'"""
@@ -148,7 +148,8 @@ def get_profile_from_filename(path, regex_pattern):

 class StateManager:
     """Tracks statistics, manages rate limits, and persists state across runs."""
-    def __init__(self, policy_name):
+    def __init__(self, policy_name, disable_log_writing=False):
+        self.disable_log_writing = disable_log_writing
         self.state_file_path = Path(f"{policy_name}_state.json")
         self.stats_file_path = Path(f"{policy_name}_stats.jsonl")
         self.lock = threading.RLock()
@@ -174,6 +175,9 @@ class StateManager:
         self._open_stats_log()

     def _load_state(self):
+        if self.disable_log_writing:
+            logger.info("Log writing is disabled. State will not be loaded from disk.")
+            return
         if not self.state_file_path.exists():
             logger.info(f"State file not found at '{self.state_file_path}', starting fresh.")
             return
@@ -198,6 +202,8 @@ class StateManager:
             logger.error(f"Could not load or parse state file {self.state_file_path}: {e}. Starting fresh.")

     def _save_state(self):
+        if self.disable_log_writing:
+            return
         with self.lock:
             try:
                 with open(self.state_file_path, 'w', encoding='utf-8') as f:
@@ -207,6 +213,8 @@
             logger.error(f"Could not save state to {self.state_file_path}: {e}")

     def _open_stats_log(self):
+        if self.disable_log_writing:
+            return
         try:
             self.stats_file_handle = open(self.stats_file_path, 'a', encoding='utf-8')
         except IOError as e:
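Note: the hunks above all add the same guard around StateManager's persistence paths. A condensed sketch of the resulting behaviour, with the class reduced to the parts relevant to the new flag (names mirror the diff, everything else is simplified):

    import json
    import threading
    from pathlib import Path

    class StateManagerSketch:
        def __init__(self, policy_name, disable_log_writing=False):
            self.disable_log_writing = disable_log_writing
            self.state_file_path = Path(f"{policy_name}_state.json")
            self.lock = threading.RLock()
            self.state = {}

        def _save_state(self):
            if self.disable_log_writing:
                return  # with --disable-log-writing, state stays in memory only
            with self.lock:
                self.state_file_path.write_text(json.dumps(self.state), encoding="utf-8")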
@@ -737,6 +745,18 @@ class StateManager:
             logger.info("Requests per proxy:")
             for proxy, count in sorted(proxy_counts.items()):
                 logger.info(f" - {proxy}: {count}")

+        profile_counts = collections.Counter(e.get('profile') for e in fetch_events if e.get('profile'))
+        if profile_counts:
+            logger.info("Requests per profile:")
+            for profile, count in sorted(profile_counts.items()):
+                logger.info(f" - {profile}: {count}")
+
+        proxy_counts = collections.Counter(e.get('proxy_url') for e in fetch_events if e.get('proxy_url'))
+        if proxy_counts:
+            logger.info("Requests per proxy:")
+            for proxy, count in sorted(proxy_counts.items()):
+                logger.info(f" - {proxy}: {count}")
+
         if download_events:
             total_attempts = len(download_events)
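Note: the new per-profile tally mirrors the existing per-proxy tally. A standalone illustration with made-up fetch events:

    import collections

    fetch_events = [
        {"profile": "profile_a", "proxy_url": "http://proxy1:3128"},
        {"profile": "profile_a", "proxy_url": "http://proxy2:3128"},
        {"profile": "profile_b"},  # no proxy recorded for this event
    ]

    profile_counts = collections.Counter(e.get('profile') for e in fetch_events if e.get('profile'))
    for profile, count in sorted(profile_counts.items()):
        print(f" - {profile}: {count}")  # profile_a: 2, profile_b: 1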
@@ -1104,9 +1124,11 @@ def run_download_worker(info_json_path, info_json_content, format_to_download, p
     if proxy_rename:
         download_cmd.extend(['--proxy-rename', str(proxy_rename)])

+    # The 'extra_args' from the policy are for the download script itself, not for yt-dlp.
+    # We need to split them and add them to the command.
     extra_args = download_policy.get('extra_args')
     if extra_args:
-        download_cmd.extend(['--extra-ytdlp-args', str(extra_args)])
+        download_cmd.extend(shlex.split(extra_args))

     # Pass through downloader settings for yt-dlp to use
     # e.g. to tell yt-dlp to use aria2c as its backend
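Note: the switch to shlex.split means a policy's extra_args string is appended as individual arguments rather than as one opaque value after '--extra-ytdlp-args', so quoted arguments survive intact. The example value below is illustrative, not taken from a real policy:

    import shlex

    extra_args = '--downloader aria2c --downloader-args "aria2c:-x 8 -s 8"'
    print(shlex.split(extra_args))
    # ['--downloader', 'aria2c', '--downloader-args', 'aria2c:-x 8 -s 8']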
@@ -1227,6 +1249,11 @@ def process_info_json_cycle(path, content, policy, state_manager, proxy_url=None
     requested_formats = [f.strip() for f in format_selection.split(',') if f.strip()]
     formats_to_test = []
     for req_fmt in requested_formats:
+        # If it's a complex selector with slashes, don't try to validate it against available formats.
+        if '/' in req_fmt:
+            formats_to_test.append(req_fmt)
+            continue
+
         # Check for exact match first
         if req_fmt in available_formats:
             formats_to_test.append(req_fmt)
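Note: selectors containing '/' are yt-dlp fallback chains (e.g. 'bestvideo*+bestaudio/best'), so they cannot be compared against the literal format IDs reported in an info.json. An isolated run of the check added above, with example format IDs:

    available_formats = {"137", "140", "251"}
    requested_formats = ["137", "bestvideo*+bestaudio/best", "9999"]

    formats_to_test = []
    for req_fmt in requested_formats:
        if '/' in req_fmt:                 # fallback chain: pass through unvalidated
            formats_to_test.append(req_fmt)
            continue
        if req_fmt in available_formats:   # plain format ID: keep only if offered
            formats_to_test.append(req_fmt)

    print(formats_to_test)  # ['137', 'bestvideo*+bestaudio/best']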
@@ -1661,6 +1688,7 @@ Overridable Policy Parameters via --set:

     parser.add_argument('--verbose', action='store_true', help='Enable verbose output for the orchestrator and underlying scripts.')
     parser.add_argument('--dry-run', action='store_true', help='Print the effective policy and exit without running the test.')
+    parser.add_argument('--disable-log-writing', action='store_true', help='Disable writing state, stats, and log files. By default, files are created for each run.')
     return parser


@@ -1761,11 +1789,6 @@ def main_stress_policy(args):
         print_policy_overrides(policy)
         return 0

-    log_level = logging.DEBUG if args.verbose else logging.INFO
-    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' if args.verbose else '%(asctime)s - %(message)s'
-    date_format = None if args.verbose else '%H:%M:%S'
-    logging.basicConfig(level=log_level, format=log_format, datefmt=date_format, stream=sys.stdout)
-
     policy = load_policy(args.policy, args.policy_name)
     policy = apply_overrides(policy, args.set)

@@ -1782,8 +1805,37 @@ def main_stress_policy(args):
     policy.setdefault('download_policy', {})['cleanup'] = args.cleanup

     policy_name = policy.get('name', args.policy_name or Path(args.policy).stem)

-    state_manager = StateManager(policy_name)
+    # --- Logging Setup ---
+    log_level = logging.DEBUG if args.verbose else logging.INFO
+    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' if args.verbose else '%(asctime)s - %(message)s'
+    date_format = None if args.verbose else '%H:%M:%S'
+
+    root_logger = logging.getLogger()
+    root_logger.setLevel(log_level)
+
+    # Remove any existing handlers to avoid duplicate logs
+    for handler in root_logger.handlers[:]:
+        root_logger.removeHandler(handler)
+
+    # Add console handler
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setFormatter(logging.Formatter(log_format, datefmt=date_format))
+    root_logger.addHandler(console_handler)
+
+    if not args.disable_log_writing:
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        log_filename = f"stress-policy-{timestamp}-{policy_name}.log"
+        try:
+            file_handler = logging.FileHandler(log_filename, encoding='utf-8')
+            file_handler.setFormatter(logging.Formatter(log_format, datefmt=date_format))
+            root_logger.addHandler(file_handler)
+            # Use print because logger is just being set up.
+            print(f"Logging to file: {log_filename}", file=sys.stderr)
+        except IOError as e:
+            print(f"Error: Could not open log file {log_filename}: {e}", file=sys.stderr)
+
+    state_manager = StateManager(policy_name, disable_log_writing=args.disable_log_writing)

     # --- Graceful shutdown handler ---
     def shutdown_handler(signum, frame):
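Note: replacing logging.basicConfig with explicit handlers is what makes the file handler optional. A minimal standalone version of the same setup (the verbose flag and file name are placeholders):

    import logging
    import sys
    from datetime import datetime

    verbose = False
    log_level = logging.DEBUG if verbose else logging.INFO
    formatter = logging.Formatter('%(asctime)s - %(message)s', datefmt='%H:%M:%S')

    root = logging.getLogger()
    root.setLevel(log_level)
    for handler in root.handlers[:]:   # avoid duplicate output if configured twice
        root.removeHandler(handler)

    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(formatter)
    root.addHandler(console)

    log_file = f"stress-policy-{datetime.now():%Y%m%d_%H%M%S}-example.log"
    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setFormatter(formatter)
    root.addHandler(file_handler)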
@@ -1881,26 +1933,20 @@ def main_stress_policy(args):
         logger.error("No sources (URLs or info.json files) to process. Exiting.")
         return 1

-    # --- Group sources by profile if in download_only mode with regex ---
-    profile_tasks = None
-    task_items = sources # Default to list of sources
+    # Grouping of sources by profile is now handled inside the main loop to support continuous mode.
     profile_extraction_regex = settings.get('profile_extraction_regex')

+    # For 'auto' worker calculation and initial display, we need to group sources once.
+    # This will be re-calculated inside the loop for continuous mode.
+    profile_tasks = None
     if mode == 'download_only' and profile_extraction_regex:
-        logger.info(f"Grouping info.json files by profile using regex: {profile_extraction_regex}")
         profile_tasks = collections.defaultdict(list)
         for source_path in sources:
             profile_name = get_profile_from_filename(source_path, profile_extraction_regex)
             if profile_name:
                 profile_tasks[profile_name].append(source_path)
             else:
-                # Assign to a default profile if no match
                 profile_tasks['unmatched_profile'].append(source_path)

-        num_profiles = len(profile_tasks)
-        logger.info(f"Found {num_profiles} unique profiles. Tasks will be processed sequentially per profile.")
-        # The new "sources" for the purpose of task distribution are the profiles.
-        task_items = list(profile_tasks.items())
-
     # --- Auto-calculate workers if needed ---
     exec_control = policy.get('execution_control', {})
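Note: grouping relies on get_profile_from_filename(path, regex_pattern). The sketch below assumes it returns the first capture group; the filenames and regex are invented for illustration:

    import collections
    import re

    def get_profile_from_filename_sketch(path, regex_pattern):
        match = re.search(regex_pattern, str(path))
        return match.group(1) if match else None

    sources = [
        "out/profile_a/video1.info.json",
        "out/profile_a/video2.info.json",
        "out/misc/video3.info.json",
    ]
    regex = r"out/(profile_[a-z]+)/"

    profile_tasks = collections.defaultdict(list)
    for source_path in sources:
        name = get_profile_from_filename_sketch(source_path, regex)
        profile_tasks[name or 'unmatched_profile'].append(source_path)

    print(dict(profile_tasks))
    # {'profile_a': ['out/profile_a/video1.info.json', 'out/profile_a/video2.info.json'],
    #  'unmatched_profile': ['out/misc/video3.info.json']}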
@@ -1977,12 +2023,12 @@ def main_stress_policy(args):

     # --- Step 1: Get info.json content ---
     info_json_content = None
+    profile_name = None
     if mode in ['full_stack', 'fetch_only']:
         gen_policy = policy.get('info_json_generation_policy', {})
         cmd_template = gen_policy.get('command_template')

         # --- Profile Generation ---
-        profile_name = None
         profile_mode = settings.get('profile_mode')
         pm_policy = settings.get('profile_management')

@@ -2303,6 +2349,28 @@ def main_stress_policy(args):
             time.sleep(10)
             continue

+        # --- Group sources for this cycle ---
+        task_items = sources
+        profile_tasks = None
+        if mode == 'download_only' and profile_extraction_regex:
+            profile_tasks = collections.defaultdict(list)
+            for source_path in sources:
+                profile_name = get_profile_from_filename(source_path, profile_extraction_regex)
+                if profile_name:
+                    profile_tasks[profile_name].append(source_path)
+                else:
+                    profile_tasks['unmatched_profile'].append(source_path)
+            task_items = list(profile_tasks.items())
+
+        # If there's nothing to do this cycle, skip.
+        if not task_items:
+            if mode == 'download_only' and settings.get('directory_scan_mode') == 'continuous':
+                # The sleep logic is handled inside the rescanning block.
+                continue
+            else:
+                logger.info("No more sources to process. Ending test.")
+                break
+
         cycles += 1
         if max_cycles > 0 and cycles > max_cycles:
             logger.info(f"Reached max cycles ({max_cycles}). Stopping.")