Recopy after ansible move

This commit is contained in:
aperez 2025-08-26 18:00:55 +03:00
parent 182deac14e
commit b3f8597e81
314 changed files with 63895 additions and 848 deletions

180
.gitignore vendored
View File

@ -1,180 +0,0 @@
.DS_Store
*.csv
results/
tmp/
context_data/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
node_modules/
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib64/
lib/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
.aider*

1
.vault_pass Normal file
View File

@ -0,0 +1 @@
ytdlp-ops

205
LICENSE
View File

@ -1,205 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [2025] [Pangramia Limited]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-----------------------------------------------------------------------

View File

@ -1,38 +0,0 @@
# Architecture and Description of the YTDLP Airflow DAGs
This document describes the architecture and purpose of the DAGs used to download videos from YouTube. The system follows a "Sensor/Worker" pattern to provide continuous, parallel processing.
## Main Processing Loop
### `ytdlp_sensor_redis_queue` (Sensor)
- **Purpose:** Pulls URLs to download from the Redis queue and starts workers to process them.
- **How it works (hybrid triggering):**
  - **On a schedule:** Every minute the DAG automatically checks the Redis queue. This guarantees that new tasks are picked up even if the processing loop was temporarily stopped (because the queue was empty).
  - **On a trigger:** When a `ytdlp_worker_per_url` worker finishes successfully, it immediately triggers the sensor without waiting for the next minute. This keeps processing continuous, with no idle gaps.
- **Logic:** Pulls a batch of URLs from Redis (the `_inbox` list). If the queue is empty, the DAG finishes successfully and waits for the next run (by trigger or by schedule).
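A minimal sketch of this pull-and-dispatch step is shown below, assuming redis-py ≥ 4 and Redis ≥ 6.2 (for `LPOP` with a count) on Airflow 2.x; the queue key, connection details, and batch size are illustrative, not the DAG's real parameters.

```python
# Illustrative sketch only: queue key, connection details, and batch size are assumptions.
import redis
from airflow.api.common.trigger_dag import trigger_dag
from airflow.exceptions import AirflowSkipException

def pull_and_dispatch(batch_size: int = 5) -> None:
    r = redis.Redis(host="redis", port=6379, password="...", decode_responses=True)
    # LPOP with a count drains up to `batch_size` URLs from the `_inbox` list.
    urls = r.lpop("ytdlp_inbox", count=batch_size) or []
    if not urls:
        # Empty queue: skip the rest of this run and wait for the next schedule/trigger.
        raise AirflowSkipException("Redis _inbox queue is empty")
    for url in urls:
        # One worker run per URL; the worker re-triggers this sensor after a success.
        trigger_dag(dag_id="ytdlp_worker_per_url", conf={"url": url})
```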
### `ytdlp_worker_per_url` (Worker)
- **Purpose:** Processes a single URL, downloads the video, and keeps the loop going.
- **How it works:**
  - Receives one URL from the sensor.
  - Calls the `ytdlp-ops-auth` service to obtain an `info.json` and a `socks5` proxy.
  - Downloads the video using the data it received. (TODO: replace the `yt-dlp` command invocation with a library call.)
  - Depending on the outcome (success/failure), writes the result to the corresponding Redis hash (`_result` or `_fail`).
  - On success, re-triggers the `ytdlp_sensor_redis_queue` sensor to continue the processing loop. On failure, the loop stops for manual diagnosis.
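A minimal sketch of the worker's result reporting, assuming redis-py; the hash key names below (`ytdlp_result`, `ytdlp_fail`) are placeholders for whatever `_result`/`_fail` keys the deployment actually uses.

```python
# Illustrative sketch only: key names and payload fields are assumptions.
import json
import time
import redis

def report_result(r: redis.Redis, url: str, ok: bool, details: dict) -> None:
    payload = json.dumps({"url": url, "ts": int(time.time()), **details})
    if ok:
        # Successful downloads land in the `_result` hash, keyed by URL.
        r.hset("ytdlp_result", url, payload)
    else:
        # Failures land in the `_fail` hash so they can be inspected and re-queued.
        r.hset("ytdlp_fail", url, payload)
```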
## Management DAGs
These DAGs are meant for manual queue management and do not take part in the automatic loop.
- **`ytdlp_mgmt_queue_add_and_verify`**: Adds URLs to the task queue (`_inbox`) and then verifies the status of that queue.
- **`ytdlp_mgmt_queues_check_status`**: Shows the state and contents of all key queues (`_inbox`, `_progress`, `_result`, `_fail`). Useful for tracking processing progress.
- **`ytdlp_mgmt_queue_clear`**: Clears (completely deletes) a given Redis queue. **Use with caution**; the operation is irreversible.
## External Services
### `ytdlp-ops-auth` (Thrift Service)
- **Purpose:** An external service that provides authentication data (tokens, cookies, proxy) for downloading videos.
- **Interaction:** The worker DAG (`ytdlp_worker_per_url`) calls this service before starting a download to obtain the data `yt-dlp` needs.

View File

@ -1 +1 @@
-2.2.0
+3.6.0

View File

@ -1 +0,0 @@
1.6.2-SNAPSHOT

2
airflow/.dockerignore Normal file
View File

@ -0,0 +1,2 @@
redis-data
minio-data

23
airflow/.env Normal file
View File

@ -0,0 +1,23 @@
AIRFLOW_IMAGE_NAME=apache/airflow:2.10.4
_AIRFLOW_WWW_USER_USERNAME=airflow
_AIRFLOW_WWW_USER_PASSWORD=airflow-password-ytld
AIRFLOW_UID=50000
AIRFLOW_PROJ_DIR=.
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow-new-super-pass@89.253.221.173:52919/airflow
AIRFLOW__CELERY__RESULT_BACKEND=db+postgresql://airflow:airflow-new-super-pass@89.253.221.173:52919/airflow
AIRFLOW__CELERY__BROKER_URL=redis://:rOhTAIlTFFylXsjhqwxnYxDChFc@89.253.221.173:52909/0
AIRFLOW_QUEUE=holisticlegs-download
AIRFLOW_QUEUE_CHECK=holisticlegs-check
AIRFLOW_QUEUE_UPLOAD=holisticlegs-upload
AIRFLOW__WEBSERVER__SECRET_KEY=8DJ6XbtIICassrVxM9jWV3eTlt5N3XtyEdyW
HOSTNAME=85.192.30.55
AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT=768M
AIRFLOW_WORKER_DOWNLOAD_MEM_RESERV=522M
AIRFLOW_WORKER_DOWNLOAD_CONCURRENCY=2
AIRFLOW_SMALL_WORKERS_MEM_LIMIT=1024M
AIRFLOW_SMALL_WORKERS_MEM_RESERV=512M

60
airflow/.env.example Normal file
View File

@ -0,0 +1,60 @@
# This file contains all environment variables for the Airflow-based deployment.
# Copy this file to .env in the same directory and fill in your production values.
# This file is used by `generate_envoy_config.py` and `docker-compose-ytdlp-ops.yaml`.
# --- Common Configuration ---
# A unique name for this server instance, used as a key in Redis.
# This is hardcoded in the docker-compose file but can be overridden here.
SERVER_IDENTITY=ytdlp-ops-airflow-service
# Redis connection details for proxy and account state management.
REDIS_HOST=redis
REDIS_PORT=6379
REDIS_PASSWORD=redis_pwd_K3fG8hJ1mN5pQ2sT
# --- Airflow Database Configuration ---
# The password for the PostgreSQL database used by Airflow.
# This should be a secure, randomly generated password.
POSTGRES_PASSWORD=pgdb_pwd_A7bC2xY9zE1wV5uP
# The password for the Airflow web UI admin user.
AIRFLOW_ADMIN_PASSWORD=admin_pwd_X9yZ3aB1cE5dF7gH
# --- Envoy & Worker Configuration ---
# The public-facing port for the Envoy load balancer that fronts the WORKERS.
ENVOY_PORT=9080
# The port for Envoy's admin/stats interface.
ENVOY_ADMIN_PORT=9901
# The public-facing port for the standalone MANAGEMENT service.
MANAGEMENT_SERVICE_PORT=9091
# The number of Python server workers to run.
# Set to 1 to simplify debugging. Multi-worker mode is experimental.
YTDLP_WORKERS=1
# The starting port for the Python workers. They will use sequential ports (e.g., 9090, 9091, ...).
YTDLP_BASE_PORT=9090
# --- Camoufox (Browser) Configuration ---
# Comma-separated list of SOCKS5 proxies to be used by Camoufox instances.
# Each proxy will get its own dedicated browser instance.
# Example: CAMOUFOX_PROXIES="socks5://user:pass@p.webshare.io:80,socks5://user:pass@p.webshare.io:81"
CAMOUFOX_PROXIES="socks5://your_proxy_user:your_proxy_pass@proxy.example.com:1080,socks5://your_proxy_user:your_proxy_pass@proxy.example.com:1081"
# Password for VNC access to the Camoufox browser instances.
VNC_PASSWORD=vnc_pwd_Z5xW8cV2bN4mP7lK
# The starting port for VNC access. Ports will be assigned sequentially (e.g., 5901, 5902, ...).
CAMOUFOX_BASE_VNC_PORT=5901
# The internal port used by Camoufox for its WebSocket server. Usually does not need to be changed.
CAMOUFOX_PORT=12345
# --- General Proxy Configuration ---
# A general-purpose SOCKS5 proxy that can be used alongside Camoufox proxies.
# This should be the IP address of the proxy server accessible from within the Docker network.
# '172.17.0.1' is often the host IP from within a container.
SOCKS5_SOCK_SERVER_IP=172.17.0.1
# --- Account Manager Configuration ---
# Account cooldown parameters (values are in minutes).
ACCOUNT_ACTIVE_DURATION_MIN=30
ACCOUNT_COOLDOWN_DURATION_MIN=60

11
airflow/.env.master Normal file
View File

@ -0,0 +1,11 @@
HOSTNAME="af-green"
REDIS_PASSWORD="rOhTAIlTFFylXsjhqwxnYxDChFc"
POSTGRES_PASSWORD="pgdb_pwd_A7bC2xY9zE1wV5uP"
AIRFLOW_UID=1003
AIRFLOW_ADMIN_PASSWORD="2r234sdfrt3q454arq45q355"
YTDLP_BASE_PORT=9090
SERVER_IDENTITY="ytdlp-ops-service-mgmt"
SERVICE_ROLE=management
AIRFLOW_GID=0
MINIO_ROOT_USER=admin
MINIO_ROOT_PASSWORD=0153093693-0009

23
airflow/.env.old Normal file
View File

@ -0,0 +1,23 @@
AIRFLOW_IMAGE_NAME=apache/airflow:2.10.4
_AIRFLOW_WWW_USER_USERNAME=airflow
_AIRFLOW_WWW_USER_PASSWORD=airflow-password-ytld
AIRFLOW_UID=50000
AIRFLOW_PROJ_DIR=.
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow-new-super-pass@89.253.221.173:52919/airflow
AIRFLOW__CELERY__RESULT_BACKEND=db+postgresql://airflow:airflow-new-super-pass@89.253.221.173:52919/airflow
AIRFLOW__CELERY__BROKER_URL=redis://:rOhTAIlTFFylXsjhqwxnYxDChFc@89.253.221.173:52909/0
AIRFLOW_QUEUE=holisticlegs-download
AIRFLOW_QUEUE_CHECK=holisticlegs-check
AIRFLOW_QUEUE_UPLOAD=holisticlegs-upload
AIRFLOW__WEBSERVER__SECRET_KEY=8DJ6XbtIICassrVxM9jWV3eTlt5N3XtyEdyW
HOSTNAME=85.192.30.55
AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT=768M
AIRFLOW_WORKER_DOWNLOAD_MEM_RESERV=522M
AIRFLOW_WORKER_DOWNLOAD_CONCURRENCY=2
AIRFLOW_SMALL_WORKERS_MEM_LIMIT=1024M
AIRFLOW_SMALL_WORKERS_MEM_RESERV=512M

71
airflow/.env.worker Normal file
View File

@ -0,0 +1,71 @@
HOSTNAME="dl001"
MASTER_HOST_IP=89.253.221.173
REDIS_PASSWORD="rOhTAIlTFFylXsjhqwxnYxDChFc"
POSTGRES_PASSWORD="pgdb_pwd_A7bC2xY9zE1wV5uP"
AIRFLOW_UID=1003
REDIS_HOST=89.253.221.173
REDIS_PORT=52909
SERVER_IDENTITY=ytdlp-ops-service-worker-dl001
# The role of the ytdlp-ops-server instance.
# 'management': Runs only state management tasks (proxy/account status). Use for the master deployment.
# 'worker' or 'all-in-one': Runs token generation tasks. Use for dedicated worker deployments.
SERVICE_ROLE=worker
# --- Envoy & Worker Configuration ---
# The public-facing port for the Envoy load balancer that fronts the WORKERS.
ENVOY_PORT=9080
# The port for Envoy's admin/stats interface.
ENVOY_ADMIN_PORT=9901
# The public-facing port for the standalone MANAGEMENT service.
MANAGEMENT_SERVICE_PORT=9091
# The number of Python server workers to run.
# Set to 1 to simplify debugging. Multi-worker mode is experimental.
YTDLP_WORKERS=1
# The starting port for the Python workers. They will use sequential ports (e.g., 9090, 9091, ...).
YTDLP_BASE_PORT=9090
# --- Camoufox (Browser) Configuration ---
# Comma-separated list of SOCKS5 proxies to be used by Camoufox instances.
# Each proxy will get its own dedicated browser instance (1:1 mapping).
# Example: CAMOUFOX_PROXIES="socks5://user:pass@p.webshare.io:1081,socks5://user:pass@p.webshare.io:1082"
CAMOUFOX_PROXIES="socks5://sslocal-rust-1087:1087"
# Password for VNC access to the Camoufox browser instances.
VNC_PASSWORD="vnc_pwd_Z5xW8cV2bN4mP7lK"
# The starting port for VNC access. Ports will be assigned sequentially (e.g., 5901, 5902, ...).
CAMOUFOX_BASE_VNC_PORT=5901
# The internal port used by Camoufox for its WebSocket server. Usually does not need to be changed.
CAMOUFOX_PORT=12345
# Legacy mode: Use single camoufox instance for all proxies
# CAMOUFOX_LEGACY_MODE=false
# Resource monitoring configuration
CAMOUFOX_MAX_MEMORY_MB=2048
CAMOUFOX_MAX_CPU_PERCENT=80
CAMOUFOX_MAX_CONCURRENT_CONTEXTS=8
CAMOUFOX_HEALTH_CHECK_INTERVAL=30
# Mapping configuration (proxy port → camoufox instance)
# socks5://proxy:1081 → camoufox-1:12345
# socks5://proxy:1082 → camoufox-2:12345
# socks5://proxy:1083 → camoufox-3:12345
# socks5://proxy:1084 → camoufox-4:12345
# --- General Proxy Configuration ---
# A general-purpose SOCKS5 proxy that can be used alongside Camoufox proxies.
# This should be the IP address of the proxy server accessible from within the Docker network.
# '172.17.0.1' is often the host IP from within a container.
SOCKS5_SOCK_SERVER_IP=172.17.0.1
# --- Account Manager Configuration ---
# Account cooldown parameters (values are in minutes).
ACCOUNT_ACTIVE_DURATION_MIN=7
ACCOUNT_COOLDOWN_DURATION_MIN=30
MINIO_ROOT_USER=admin
MINIO_ROOT_PASSWORD=0153093693-0009
AIRFLOW_GID=0

View File

@ -18,6 +18,10 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /usr/share/man /usr/share/doc /usr/share/doc-base
+# Download and install mc (MinIO client)
+RUN wget https://dl.min.io/client/mc/release/linux-amd64/mc -O /usr/local/bin/mc && \
+    chmod +x /usr/local/bin/mc
 # Download and install custom FFmpeg build from yt-dlp's recommended source
 RUN FFMPEG_URL="https://github.com/yt-dlp/FFmpeg-Builds/releases/download/latest/ffmpeg-master-latest-linux64-gpl.tar.xz" && \
     echo "Downloading FFmpeg from $FFMPEG_URL" && \
@ -47,7 +51,6 @@ RUN pip install --no-cache-dir \
 COPY --chown=airflow:airflow setup.py ./
 COPY --chown=airflow:airflow VERSION ./
 COPY --chown=airflow:airflow yt_ops_services ./yt_ops_services/
-COPY --chown=airflow:airflow server_fix ./server_fix/
 COPY --chown=airflow:airflow thrift_model ./thrift_model/
 COPY --chown=airflow:airflow pangramia ./pangramia/
@ -56,6 +59,14 @@ COPY --chown=airflow:airflow pangramia ./pangramia/
 RUN pip install --no-cache-dir -e .
 # Copy token generator scripts and utils with correct permissions
-COPY --chown=airflow:airflow generate_tokens_direct.mjs ./
-COPY --chown=airflow:airflow utils ./utils/
-COPY --chown=airflow:airflow token_generator ./token_generator/
+# COPY --chown=airflow:airflow generate_tokens_direct.mjs ./
+# COPY --chown=airflow:airflow utils ./utils/
+# COPY --chown=airflow:airflow token_generator ./token_generator/
+# --- Always update yt-dlp to latest nightly on container start ---
+# This is done in the entrypoint so every worker run uses the freshest build
+COPY --chown=airflow:airflow update-yt-dlp.sh /usr/local/bin/update-yt-dlp.sh
+RUN chmod +x /usr/local/bin/update-yt-dlp.sh
+# Expose bgutil plugin to worker path
+ENV PYTHONPATH=/opt/bgutil-ytdlp-pot-provider/plugin:$PYTHONPATH

19
airflow/Dockerfile.thrift Normal file
View File

@ -0,0 +1,19 @@
# Dockerfile.thrift
FROM python:3.9-slim as builder
WORKDIR /app
#COPY ../setup.py /app/setup.py
#COPY ../requirements.txt /app/requirements.txt
#COPY ../yt_ops_services /app/yt_ops_services
#COPY ../thrift_model /app/thrift_model
#COPY ../server /app/server
COPY requirements.txt /app/requirements.txt
# Install dependencies
RUN pip install --user --no-cache-dir -r /app/requirements.txt
# Install the custom package in editable mode
#RUN pip install --user -e /app

249
airflow/README-proxy.RU.md Normal file
View File

@ -0,0 +1,249 @@
# Proxy and Account Management Strategy
This document describes the intelligent resource management strategy (for proxies and accounts) used by the `ytdlp-ops-server`. The goal of this system is to maximize the success rate, minimize blocks, and ensure fault tolerance.
The server can run in different roles to support a distributed architecture, separating management tasks from token generation work.
---
## Service Roles and Architecture
The server is designed to run in one of three roles, specified by the `--service-role` flag:
- **`management`**: A single, lightweight service instance responsible for all management API calls.
  - **Purpose**: Provides a centralized endpoint for monitoring and managing the state of all proxies and accounts across the system.
  - **Behavior**: Exposes only management functions (`getProxyStatus`, `banAccount`, etc.). Calls to token generation functions will fail.
  - **Deployment**: Runs as a single container (`ytdlp-ops-management`) and exposes its port directly to the host (e.g., port `9091`), bypassing Envoy.
- **`worker`**: The primary workhorse for token and `info.json` generation.
  - **Purpose**: Handles all token generation requests.
  - **Behavior**: Implements the full API, but its management functions are scoped to its own `server_identity`.
  - **Deployment**: Runs as a scalable service (`ytdlp-ops-worker`) behind the Envoy load balancer (e.g., port `9080`).
- **`all-in-one`** (Default): A single instance that performs both management and worker roles. Ideal for local development or small-scale deployments.
This architecture allows for a robust, federated system where workers manage their own resources locally, while a central service provides a global view for management and monitoring.
---
## 1. Account Lifecycle Management (Cooldown / Resting)
**Goal:** Prevent excessive use and subsequent blocking of accounts by giving them "rest" periods after intensive work.
### How It Works:
The account lifecycle consists of three states:
- **`ACTIVE`**: The account is active and used for tasks. An activity timer starts on its first successful use.
- **`RESTING`**: If an account has been `ACTIVE` longer than the configured limit, the `AccountManager` automatically moves it to a "resting" state. The Airflow worker will not select it for new jobs.
- **Return to `ACTIVE`**: After the rest period ends, the `AccountManager` automatically returns the account to the `ACTIVE` state, making it available again.
### Configuration:
These parameters are configured when starting the `ytdlp-ops-server`.
- `--account-active-duration-min`: The "work time" in **minutes** an account can be continuously active before being moved to `RESTING`.
  - **Default:** `30` (minutes).
- `--account-cooldown-duration-min`: The "rest time" in **minutes** an account must remain in the `RESTING` state.
  - **Default:** `60` (minutes).
**Where to Configure:**
The parameters are passed as command-line arguments when starting the server. When using Docker Compose, this is done in `airflow/docker-compose-ytdlp-ops.yaml`:
```yaml
command:
  # ... other parameters
  - "--account-active-duration-min"
  - "${ACCOUNT_ACTIVE_DURATION_MIN:-30}"
  - "--account-cooldown-duration-min"
  - "${ACCOUNT_COOLDOWN_DURATION_MIN:-60}"
```
You can change the default values by setting the `ACCOUNT_ACTIVE_DURATION_MIN` and `ACCOUNT_COOLDOWN_DURATION_MIN` environment variables in your `.env` file.
**Relevant Files:**
- `server_fix/account_manager.py`: Contains the core logic for state transitions.
- `ytdlp_ops_server_fix.py`: Parses the command-line arguments.
- `airflow/docker-compose-ytdlp-ops.yaml`: Passes the arguments to the server container.
---
## 2. Smart Banning Strategy
**Goal:** Avoid unfairly banning good proxies. The problem is often with the account, not the proxy it is working through.
### How It Works:
#### Stage 1: Ban the Account First
- When a serious, bannable error occurs (e.g., `BOT_DETECTED` or `SOCKS5_CONNECTION_FAILED`), the system penalizes **only the account** that caused the error.
- For the proxy, the error is simply recorded as a single failure; the proxy itself is **not banned** and stays in rotation.
#### Stage 2: Ban the Proxy via a "Sliding Window"
- A proxy is banned automatically only if it shows **systematic failures with DIFFERENT accounts** over a short period of time.
- This is a reliable indicator that the proxy itself is the problem. The `ProxyManager` on the server tracks this and automatically bans such a proxy.
### Configuration:
These parameters are **hard-coded** as constants in the source code; changing them requires editing the file.
**Where to Configure:**
- **File:** `server_fix/proxy_manager.py`
- **Constants** in the `ProxyManager` class:
  - `FAILURE_WINDOW_SECONDS`: The time window in seconds for analyzing failures.
    - **Default:** `3600` (1 hour).
  - `FAILURE_THRESHOLD_COUNT`: The minimum total number of failures to trigger a check.
    - **Default:** `3`.
  - `FAILURE_THRESHOLD_UNIQUE_ACCOUNTS`: The minimum number of **unique accounts** that must have failed with the proxy for it to be banned.
    - **Default:** `3`.
**Relevant Files:**
- `server_fix/proxy_manager.py`: Contains the sliding window logic and constants.
- `airflow/dags/ytdlp_ops_worker_per_url.py`: The `handle_bannable_error_callable` function implements the "account-only" ban policy.
---
### Account Statuses Explained
You can view the status of all accounts using the `ytdlp_mgmt_proxy_account` DAG. The statuses have the following meanings:
- **`ACTIVE`**: The account is healthy and available for use. By default, an account is considered `ACTIVE` if it has no specific status set.
- **`BANNED`**: The account has been temporarily disabled due to repeated failures (e.g., `BOT_DETECTED` errors) or banned manually. The status shows the time remaining until it automatically becomes `ACTIVE` again (e.g., `BANNED (active in 55m)`).
- **`RESTING`**: The account has been used for an extended period and is in a mandatory "rest" period to prevent burnout. The status shows the time remaining until it becomes `ACTIVE` again (e.g., `RESTING (active in 25m)`).
- **(Blank Status)**: In older versions, an account that had only ever failed (and never succeeded) could appear with a blank status. This has been fixed; such accounts are now correctly shown as `ACTIVE`.
---
## 3. End-to-End Rotation Flow: How It All Works Together
This section describes, step by step, how a worker is given an account and a proxy for a single job, bringing together all the management strategies described above.
1. **Worker Initialization (`ytdlp_ops_worker_per_url`)**
   - The DAG run starts, triggered either by the orchestrator or by its own previous successful run.
   - The `pull_url_from_redis` task fetches a URL from the Redis `_inbox` queue.
2. **Account Selection (Airflow Worker)**
   - The `assign_account` task runs.
   - It generates the full list of potential account IDs based on the `account_pool` parameter (e.g., `my_prefix_01` through `my_prefix_50`).
   - It connects to Redis and checks the status of each account in that list.
   - It builds a new, temporary list containing only accounts that are **not** in a `BANNED` or `RESTING` state.
   - If the resulting list of active accounts is empty, the worker fails (unless auto-creation is enabled).
   - It then uses **`random.choice()`** to pick one account from the filtered list of active accounts.
   - The chosen `account_id` is passed to the next task.
3. **Proxy Selection (`ytdlp-ops-server`)**
   - The `get_token` task runs and sends the randomly chosen `account_id` in a Thrift RPC call to the `ytdlp-ops-server`.
   - On the server, the `ProxyManager` is asked for a proxy.
   - The `ProxyManager`:
     a. Refreshes its internal state by loading the statuses of all proxies from Redis.
     b. Filters the list, keeping only proxies with an `ACTIVE` status.
     c. Applies the "sliding window" ban policy, potentially banning proxies that have failed too often recently.
     d. Selects the next available proxy from the active list using a **round-robin** index.
     e. Returns the selected `proxy_url`.
4. **Execution and Reporting**
   - The server now has both the `account_id` (from Airflow) and the `proxy_url` (from its `ProxyManager`).
   - It proceeds with the token generation process using these resources.
   - Upon completion (success or failure), it reports the outcome to Redis, updating the statuses of the specific account and proxy that were used. This affects their failure counters, rest timers, and so on for the next run.
This separation of responsibilities is key:
- **The Airflow worker (the `assign_account` task)** is responsible for **randomly selecting an active account**, while maintaining affinity (re-using the same account after a success).
- **The `ytdlp-ops-server`** is responsible for the **round-robin selection of an active proxy**.
---
## 4. Automatic Account Ban on Repeated Failures
**Goal:** Automatically remove from rotation accounts that consistently cause non-bannable errors (e.g., a wrong password or authorization problems).
### How It Works:
- The `AccountManager` tracks the number of **consecutive** failures for each account.
- On any successful operation, the counter is reset.
- If the number of consecutive failures reaches a configured threshold, the account is automatically banned for a set duration.
### Configuration:
These parameters are set in the `AccountManager` constructor.
**Where to Configure:**
- **File:** `server_fix/account_manager.py`
- **Parameters** in the `__init__` method of `AccountManager`:
  - `failure_threshold`: The number of consecutive failures before a ban.
    - **Default:** `5`.
  - `ban_duration_s`: The duration of the ban in seconds.
    - **Default:** `3600` (1 hour).
---
## 5. Monitoring and Recovery
### How to Check Statuses
The **`ytdlp_mgmt_proxy_account`** DAG is the main tool for monitoring the health of your resources. It connects directly to the **management service** to perform actions.
- **DAG ID:** `ytdlp_mgmt_proxy_account`
- **How to Use:** Trigger the DAG from the Airflow UI. Make sure the `management_host` and `management_port` parameters point correctly to your `ytdlp-ops-management` service instance. For a full overview, set the parameters:
  - `entity`: `all`
  - `action`: `list`
- **Result:** The DAG log will display tables with the current status of all accounts and proxies. For accounts in the `BANNED` or `RESTING` state, it shows the time remaining until they become active again (e.g., `RESTING (active in 45m)`). For proxies, it highlights which one is `(next)` in the rotation for a specific worker.
### What Happens If All Accounts Are Banned or Resting?
If the entire account pool becomes unavailable (`BANNED` or `RESTING`), the system pauses by default.
- The `ytdlp_ops_worker_per_url` DAG fails with an `AirflowException` at the `assign_account` step, because the active account pool is empty.
- This stops the processing loops. The system remains paused until accounts are unbanned manually or their ban/rest timers expire. After that, you can restart the processing loops with the `ytdlp_ops_orchestrator` DAG.
- The `ytdlp_ops_worker_per_url` DAG graph now explicitly shows tasks such as `assign_account`, `get_token`, `ban_account`, `retry_get_token`, etc., which makes the execution flow and failure points much clearer.
The system can be configured to create new accounts automatically so that processing does not halt completely.
#### Automatic Account Creation on Exhaustion
- **Goal**: Keep the processing pipeline running even if all accounts in the primary pool are temporarily banned or resting.
- **How it works**: If the `auto_create_new_accounts_on_exhaustion` parameter is set to `True` and the account pool is defined by a prefix (rather than an explicit list), the system generates a new, unique account ID when it finds the active pool empty.
- **New Account Naming**: New accounts are created in the format `{prefix}-auto-{unique_id}`.
- **Configuration**:
  - **Parameter**: `auto_create_new_accounts_on_exhaustion`
  - **Where to set**: In the `ytdlp_ops_orchestrator` DAG configuration when triggering a run.
  - **Default**: `True`.
---
## 6. Failure Handling and Retry Policy
**Goal:** Provide flexible control over how the system behaves when a worker hits a bannable error (e.g., `BOT_DETECTED`).
### How It Works
When a worker's `get_token` task fails with a bannable error, the system's behavior is determined by the `on_bannable_failure` policy, which can be configured when starting `ytdlp_ops_orchestrator`.
### Configuration
- **Parameter**: `on_bannable_failure`
- **Where to set**: In the `ytdlp_ops_orchestrator` DAG configuration.
- **Options**:
  - `stop_loop` (Strictest):
    - The account that was used is banned.
    - The URL is marked as failed in the `_fail` Redis hash.
    - The worker's processing loop is **stopped**. The processing "lane" becomes inactive.
  - `retry_with_new_account` (Default, most resilient):
    - The failing account is banned.
    - The worker immediately retries the **same URL** with a new, unused account from the pool.
    - If the retry succeeds, the worker continues its loop and processes the next URL.
    - If the retry also fails, the second account **and the proxy that was used** are banned as well, and the worker's loop stops.
  - `retry_and_ban_account_only`:
    - Similar to `retry_with_new_account`, but on the second failure **only the second account** is banned, not the proxy.
    - This is useful when you trust your proxies but want to cycle aggressively through failing accounts.
  - `retry_without_ban` (Most lenient):
    - The worker retries with a new account, but **neither accounts nor proxies are ever banned**.
    - This policy is useful for debugging, or when you are confident that failures are transient and not caused by resource problems.
This policy lets the system tolerate failures of individual accounts without losing URLs, while giving granular control over when to ban accounts and/or proxies if the problem persists.
---
## 7. Worker DAG Logic (`ytdlp_ops_worker_per_url`)
This DAG is the workhorse of the system. It is designed as a self-sustaining loop that processes one URL per run. The failure handling and retry logic is now explicitly visible in the DAG's task graph.
### Tasks and Their Purpose:
- **`pull_url_from_redis`**: Pulls a single URL from the Redis `_inbox` queue. If the queue is empty, the DAG finishes with the `skipped` status, stopping this processing lane.
- **`assign_account`**: Selects an account for the job. It supports **account affinity**, re-using the same account from the previous successful run in its lane. If this is the first run or the previous one failed, it picks a random active account.
- **`get_token`**: The primary attempt to obtain tokens and `info.json` by calling the `ytdlp-ops-server`.
- **`handle_bannable_error_branch`**: A branching task that runs when `get_token` fails. It inspects the error and decides the next step based on the `on_bannable_failure` policy.
- **`ban_account_and_prepare_for_retry`**: If a retry is allowed, this task bans the failed account and selects a new one.
- **`retry_get_token`**: A second attempt to obtain a token using the new account.
- **`ban_second_account_and_proxy`**: If the retry also fails, this task bans the second account and the proxy that was used.
- **`download_and_probe`**: If `get_token` or `retry_get_token` succeeds, this task uses `yt-dlp` to download the media and `ffmpeg` to verify the file's integrity.
- **`mark_url_as_success`**: If `download_and_probe` succeeds, this task writes the successful result to the `_result` hash in Redis.
- **`handle_generic_failure`**: If any task fails with an unrecoverable error, this task writes detailed error information to the `_fail` hash in Redis.
- **`decide_what_to_do_next`**: The final branching task, which decides whether to continue the loop (`trigger_self_run`), stop it gracefully (`stop_loop`), or mark it as failed (`fail_loop`).
- **`trigger_self_run`**: The task that actually triggers the next DAG run, creating a continuous loop.
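For orientation, here is a minimal, simplified sketch of how such a self-triggering worker graph can be wired in Airflow 2.x. The callables are stubs and several tasks are omitted; it illustrates the task list above, not the project's actual `ytdlp_ops_worker_per_url.py`.

```python
# Simplified illustration only; callables are stubs, not the real implementation.
from datetime import datetime
from airflow import DAG
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.operators.empty import EmptyOperator
from airflow.operators.trigger_dagrun import TriggerDagRunOperator

def _pull_url(**_): ...
def _assign_account(**_): ...
def _get_token(**_): ...
def _decide_next(**_):
    # Return the task_id of the branch to follow (continue or stop the loop).
    return "trigger_self_run"

with DAG(
    dag_id="ytdlp_worker_per_url_sketch",
    start_date=datetime(2025, 1, 1),
    schedule=None,  # Triggered by the sensor/orchestrator, not by a schedule.
    catchup=False,
) as dag:
    pull_url = PythonOperator(task_id="pull_url_from_redis", python_callable=_pull_url)
    assign_account = PythonOperator(task_id="assign_account", python_callable=_assign_account)
    get_token = PythonOperator(task_id="get_token", python_callable=_get_token)
    download_and_probe = EmptyOperator(task_id="download_and_probe")
    mark_success = EmptyOperator(task_id="mark_url_as_success")
    decide = BranchPythonOperator(task_id="decide_what_to_do_next", python_callable=_decide_next)
    trigger_self = TriggerDagRunOperator(
        task_id="trigger_self_run",
        trigger_dag_id="ytdlp_worker_per_url_sketch",  # Re-trigger the same DAG to continue the loop.
    )
    stop_loop = EmptyOperator(task_id="stop_loop")

    pull_url >> assign_account >> get_token >> download_and_probe >> mark_success
    mark_success >> decide >> [trigger_self, stop_loop]
```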

322
airflow/README-proxy.md Normal file
View File

@ -0,0 +1,322 @@
# Proxy and Account Management Strategy
This document describes the intelligent resource management strategy (for proxies and accounts) used by the `ytdlp-ops-server`. The goal of this system is to maximize the success rate, minimize blocks, and ensure fault tolerance.
The server can run in different roles to support a distributed architecture, separating management tasks from token generation work.
---
## Service Roles and Architecture
The server is designed to run in one of three roles, specified by the `--service-role` flag:
- **`management`**: A single, lightweight service instance responsible for all management API calls.
- **Purpose**: Provides a centralized endpoint for monitoring and managing the state of all proxies and accounts across the system.
- **Behavior**: Exposes only management functions (`getProxyStatus`, `banAccount`, etc.). Calls to token generation functions will fail.
- **Deployment**: Runs as a single container (`ytdlp-ops-management`) and exposes its port directly to the host (e.g., port `9091`), bypassing Envoy.
- **`worker`**: The primary workhorse for token and `info.json` generation.
- **Purpose**: Handles all token generation requests.
- **Behavior**: Implements the full API, but its management functions are scoped to its own `server_identity`.
- **Deployment**: Runs as a scalable service (`ytdlp-ops-worker`) behind the Envoy load balancer (e.g., port `9080`).
- **`all-in-one`** (Default): A single instance that performs both management and worker roles. Ideal for local development or small-scale deployments.
This architecture allows for a robust, federated system where workers manage their own resources locally, while a central service provides a global view for management and monitoring.
---
## 1. Account Lifecycle Management (Cooldown / Resting)
**Goal:** To prevent excessive use and subsequent blocking of accounts by providing them with "rest" periods after intensive work.
### How It Works:
The account lifecycle consists of three states:
- **`ACTIVE`**: The account is active and used for tasks. An activity timer starts on its first successful use.
- **`RESTING`**: If an account has been `ACTIVE` for longer than the configured limit, the `AccountManager` automatically moves it to a "resting" state. The Airflow worker will not select it for new jobs.
- **Return to `ACTIVE`**: After the cooldown period ends, the `AccountManager` automatically returns the account to the `ACTIVE` state, making it available again.
### Configuration:
These parameters are configured when starting the `ytdlp-ops-server`.
- `--account-active-duration-min`: The "action time" in **minutes** an account can be continuously active before being moved to `RESTING`.
- **Default:** `30` (minutes).
- `--account-cooldown-duration-min`: The "rest time" in **minutes** an account must remain in the `RESTING` state.
- **Default:** `60` (minutes).
**Where to Configure:**
The parameters are passed as command-line arguments to the server. When using Docker Compose, this is done in `airflow/docker-compose-ytdlp-ops.yaml`:
```yaml
command:
# ... other parameters
- "--account-active-duration-min"
- "${ACCOUNT_ACTIVE_DURATION_MIN:-30}"
- "--account-cooldown-duration-min"
- "${ACCOUNT_COOLDOWN_DURATION_MIN:-60}"
```
You can change the default values by setting the `ACCOUNT_ACTIVE_DURATION_MIN` and `ACCOUNT_COOLDOWN_DURATION_MIN` environment variables in your `.env` file.
**Relevant Files:**
- `server_fix/account_manager.py`: Contains the core logic for state transitions.
- `ytdlp_ops_server_fix.py`: Parses the command-line arguments.
- `airflow/docker-compose-ytdlp-ops.yaml`: Passes the arguments to the server container.
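As a rough illustration of the lifecycle above, the sketch below shows how an `ACTIVE`/`RESTING` decision could be derived from timestamps. It is not the actual `server_fix/account_manager.py` logic, and the timestamp fields are assumptions.

```python
# Illustrative sketch only; field names and storage layout are assumptions.
import time

ACTIVE_DURATION_S = 30 * 60    # mirrors --account-active-duration-min default
COOLDOWN_DURATION_S = 60 * 60  # mirrors --account-cooldown-duration-min default

def next_state(state: str, active_since: float, resting_since: float, now: float | None = None) -> str:
    """Return the state an account should be in, given when it entered its current state."""
    now = now if now is not None else time.time()
    if state == "ACTIVE" and now - active_since >= ACTIVE_DURATION_S:
        return "RESTING"  # Worked long enough; start the rest period.
    if state == "RESTING" and now - resting_since >= COOLDOWN_DURATION_S:
        return "ACTIVE"   # Rest period over; make the account available again.
    return state
```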
---
## 2. Smart Banning Strategy
**Goal:** To avoid unfairly banning good proxies. The problem is often with the account, not the proxy it's using.
### How It Works:
#### Stage 1: Ban the Account First
- When a serious, bannable error occurs (e.g., `BOT_DETECTED` or `SOCKS5_CONNECTION_FAILED`), the system penalizes **only the account** that caused the error.
- For the proxy, this error is simply recorded as a single failure, but the proxy itself is **not banned** and remains in rotation.
#### Stage 2: Ban the Proxy via "Sliding Window"
- A proxy is banned automatically only if it shows **systematic failures with DIFFERENT accounts** over a short period.
- This is a reliable indicator that the proxy itself is the problem. The `ProxyManager` on the server tracks this and automatically bans such a proxy.
### Configuration:
These parameters are **hard-coded** as constants in the source code. Changing them requires editing the file.
**Where to Configure:**
- **File:** `server_fix/proxy_manager.py`
- **Constants** in the `ProxyManager` class:
- `FAILURE_WINDOW_SECONDS`: The time window in seconds for analyzing failures.
- **Default:** `3600` (1 hour).
- `FAILURE_THRESHOLD_COUNT`: The minimum total number of failures to trigger a check.
- **Default:** `3`.
- `FAILURE_THRESHOLD_UNIQUE_ACCOUNTS`: The minimum number of **unique accounts** that must have failed with the proxy to trigger a ban.
- **Default:** `3`.
**Relevant Files:**
- `server_fix/proxy_manager.py`: Contains the sliding window logic and constants.
- `airflow/dags/ytdlp_ops_worker_per_url.py`: The `handle_bannable_error_callable` function implements the "account-only" ban policy.
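A minimal sketch of the sliding-window decision described above, assuming a proxy's recent failures are available as `(timestamp, account_id)` pairs; the constants mirror the documented defaults, but the data access and function shape are illustrative rather than the real `ProxyManager` code.

```python
# Illustrative sketch only; how failures are fetched from Redis is not shown here.
import time

FAILURE_WINDOW_SECONDS = 3600
FAILURE_THRESHOLD_COUNT = 3
FAILURE_THRESHOLD_UNIQUE_ACCOUNTS = 3

def should_ban_proxy(failures: list[tuple[float, str]], now: float | None = None) -> bool:
    """failures: (unix_timestamp, account_id) pairs recorded for one proxy."""
    now = now if now is not None else time.time()
    recent = [(ts, acc) for ts, acc in failures if now - ts <= FAILURE_WINDOW_SECONDS]
    if len(recent) < FAILURE_THRESHOLD_COUNT:
        return False
    # Ban only when several *different* accounts failed through this proxy.
    return len({acc for _, acc in recent}) >= FAILURE_THRESHOLD_UNIQUE_ACCOUNTS
```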
---
### Account Statuses Explained
You can view the status of all accounts using the `ytdlp_mgmt_proxy_account` DAG. The statuses have the following meanings:
- **`ACTIVE`**: The account is healthy and available for use. An account is considered `ACTIVE` by default if it has no specific status set.
- **`BANNED`**: The account has been temporarily disabled due to repeated failures (e.g., `BOT_DETECTED` errors) or by a manual ban. The status will show the time remaining until it automatically becomes `ACTIVE` again (e.g., `BANNED (active in 55m)`).
- **`RESTING`**: The account has been used for an extended period and is in a mandatory "cooldown" period to prevent burnout. The status will show the time remaining until it becomes `ACTIVE` again (e.g., `RESTING (active in 25m)`).
- **(Blank Status)**: In older versions, an account that had only ever failed (and never succeeded) might appear with a blank status. This has been fixed; these accounts are now correctly shown as `ACTIVE`.
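The "time remaining" suffix shown in the status listings (e.g. `BANNED (active in 55m)`) can be rendered with a small helper like the one below; the expiry-timestamp input is an assumption about how the state is stored, not the service's actual schema.

```python
# Illustrative sketch only; assumes an expiry timestamp is stored per account.
import time

def render_status(status: str, active_again_at: float | None) -> str:
    if status in ("BANNED", "RESTING") and active_again_at:
        minutes = max(0, int((active_again_at - time.time()) // 60))
        return f"{status} (active in {minutes}m)"
    return status or "ACTIVE"  # Accounts with no explicit status count as ACTIVE.
```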
---
## 3. End-to-End Rotation Flow: How It All Works Together
This section describes the step-by-step flow of how a worker gets assigned an account and a proxy for a single job, integrating all the management strategies described above.
1. **Worker Initialization (`ytdlp_ops_worker_per_url`)**
- The DAG run starts, triggered either by the orchestrator or by its previous successful run.
- The `pull_url_from_redis` task fetches a URL from the Redis `_inbox` queue.
2. **Account Selection (Airflow Worker)**
- The `assign_account` task is executed.
- It generates the full list of potential account IDs based on the `account_pool` (e.g., `my_prefix_01` to `my_prefix_50`).
- It connects to Redis and iterates through this list, checking the status of each account.
- It builds a new, temporary list containing only accounts that are **not** in a `BANNED` or `RESTING` state.
- If the resulting list of active accounts is empty, the worker fails (unless auto-creation is enabled).
- It then takes the filtered list of active accounts and uses **`random.choice()`** to select one.
- The chosen `account_id` is passed to the next task.
3. **Proxy Selection (`ytdlp-ops-server`)**
- The `get_token` task runs, sending the randomly chosen `account_id` in a Thrift RPC call to the `ytdlp-ops-server`.
- On the server, the `ProxyManager` is asked for a proxy. This happens on **every single request**.
- The `ProxyManager` performs the following steps on every call to ensure it has the most up-to-date information:
a. **Query Redis:** It fetches the *entire* current state of all proxies from Redis. This ensures it immediately knows about any status changes (e.g., a ban) made by other workers.
b. **Rebuild Active List:** It rebuilds its internal in-memory list of proxies, including only those with an `ACTIVE` status.
c. **Apply Sliding Window Ban:** It checks the recent failure history for each active proxy. If a proxy has failed too many times with different accounts, it is banned on the spot, even if its status was `ACTIVE`.
d. **Select Proxy:** It selects the next available proxy from the final, filtered active list using a **round-robin** index.
e. **Return Proxy:** It returns the selected `proxy_url` to be used for the token generation task.
- **Worker Affinity**: Crucially, even though workers may share a proxy state in Redis under a common `server_identity`, each worker instance will **only ever use the proxies it was configured with at startup**. It uses Redis to check the status of its own proxies but will ignore other proxies in the shared pool.
4. **Execution and Reporting**
- The server now has both the `account_id` (from Airflow) and the `proxy_url` (from its `ProxyManager`).
- It proceeds with the token generation process using these resources.
- Upon completion (success or failure), it reports the outcome to Redis, updating the status for both the specific account and proxy that were used. This affects their failure counters, cooldown timers, etc., for the next run.
This separation of concerns is key:
- **The Airflow worker (`assign_account` task)** is responsible for the **random selection of an active account**, while maintaining affinity (re-using the same account after a success).
- **The `ytdlp-ops-server`** is responsible for the **round-robin selection of an active proxy**.
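To make this division of labor concrete, here is a compact sketch of the two selection steps: random account choice on the Airflow side and round-robin proxy choice on the server side. The data structures and names are illustrative; real state lives in Redis and in the `ProxyManager`.

```python
# Illustrative sketch only; not the project's actual selection code.
import random

def pick_account(pool: list[str], statuses: dict[str, str]) -> str:
    """Airflow side: random choice among accounts that are neither BANNED nor RESTING."""
    active = [a for a in pool if statuses.get(a, "ACTIVE") not in ("BANNED", "RESTING")]
    if not active:
        raise RuntimeError("No active accounts in the pool")  # AirflowException in the real DAG
    return random.choice(active)

class RoundRobinProxies:
    """Server side: cycle through ACTIVE proxies with a round-robin index."""
    def __init__(self, proxies: list[str]):
        self._proxies = proxies
        self._index = 0

    def next_proxy(self, statuses: dict[str, str]) -> str:
        active = [p for p in self._proxies if statuses.get(p, "ACTIVE") == "ACTIVE"]
        if not active:
            raise RuntimeError("NO_ACTIVE_PROXIES")
        proxy = active[self._index % len(active)]
        self._index += 1
        return proxy
```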
---
## 4. Automatic Account Ban on Consecutive Failures
**Goal:** To automatically remove accounts from rotation that consistently cause non-bannable errors (e.g., incorrect password, authorization issues).
### How It Works:
- The `AccountManager` tracks the number of **consecutive** failures for each account.
- On any successful operation, this counter is reset.
- If the number of consecutive failures reaches a set threshold, the account is automatically banned for a specified duration.
### Configuration:
These parameters are set in the `AccountManager` constructor.
**Where to Configure:**
- **File:** `server_fix/account_manager.py`
- **Parameters** in the `__init__` method of `AccountManager`:
- `failure_threshold`: The number of consecutive failures before a ban.
- **Default:** `5`.
- `ban_duration_s`: The duration of the ban in seconds.
- **Default:** `3600` (1 hour).
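A minimal sketch of the consecutive-failure rule with the documented defaults; the counter storage is simplified to in-memory dicts, whereas the real `AccountManager` keeps its state in Redis.

```python
# Illustrative sketch only; the real implementation persists state in Redis.
import time

class ConsecutiveFailureBanner:
    def __init__(self, failure_threshold: int = 5, ban_duration_s: int = 3600):
        self.failure_threshold = failure_threshold
        self.ban_duration_s = ban_duration_s
        self._failures: dict[str, int] = {}
        self.banned_until: dict[str, float] = {}

    def record_success(self, account_id: str) -> None:
        self._failures[account_id] = 0  # Any success resets the streak.

    def record_failure(self, account_id: str) -> None:
        self._failures[account_id] = self._failures.get(account_id, 0) + 1
        if self._failures[account_id] >= self.failure_threshold:
            self.banned_until[account_id] = time.time() + self.ban_duration_s
            self._failures[account_id] = 0
```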
---
## 5. Monitoring and Recovery
### How to Check Statuses
The **`ytdlp_mgmt_proxy_account`** DAG is the primary tool for monitoring the health of your resources. It connects directly to the **management service** to perform actions.
- **DAG ID:** `ytdlp_mgmt_proxy_account`
- **How to Use:** Trigger the DAG from the Airflow UI. Ensure the `management_host` and `management_port` parameters are correctly set to point to your `ytdlp-ops-management` service instance. To get a full overview, set the parameters:
- `entity`: `all`
- `action`: `list`
- **Result:** The DAG log will display tables with the current status of all accounts and proxies. For `BANNED` or `RESTING` accounts, it shows the time remaining until they become active again (e.g., `RESTING (active in 45m)`). For proxies, it highlights which proxy is `(next)` in the round-robin rotation for a specific worker.
### Worker vs. Management Service Roles in Automatic State Changes
It is important to understand the distinct roles each service plays in the automatic state management of accounts and proxies. The system uses a reactive, "on-read" update mechanism.
- **The `worker` service is proactive.** It is responsible for putting resources into a "bad" state.
- When a worker encounters too many failures with an account, it moves the account to `BANNED`.
- When an account's activity timer expires, the worker moves it to `RESTING`.
- When a proxy fails the sliding window check during a token request, the worker bans it.
- **The `management` service is reactive but crucial for recovery.** It is responsible for taking resources out of a "bad" state.
- The logic to check if a ban has expired or a rest period is over is located in the `getAccountStatus` and `getProxyStatus` methods.
- This means an account or proxy is only returned to an `ACTIVE` state **when its status is queried**.
- Since the `ytdlp_mgmt_proxy_account` DAG calls these methods on the `management` service, running this DAG is the primary mechanism for automatically clearing expired bans and rest periods.
In summary, workers put resources into timeout, and the management service (when queried) brings them back. This makes periodic checks with the management DAG important for overall system health and recovery.
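A minimal sketch of this "on-read" recovery, assuming a JSON-encoded state with an `until` timestamp (field names are illustrative, not the actual server code):
```python
import json
import time

def get_account_status(redis_client, key, account_id):
    raw = redis_client.hget(key, account_id)
    state = json.loads(raw) if raw else {"status": "ACTIVE"}
    # The recovery happens here, on read: if the ban or rest timer has expired,
    # flip the account back to ACTIVE and persist the change.
    if state["status"] in ("BANNED", "RESTING") and time.time() >= state.get("until", 0):
        state = {"status": "ACTIVE"}
        redis_client.hset(key, account_id, json.dumps(state))
    return state
```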
### Important Note on Unbanning Proxies
When a proxy is unbanned (either individually via `unban` or collectively via `unban_all`), the system performs two critical actions:
1. It sets the proxy's status back to `ACTIVE`.
2. It **deletes the proxy's entire failure history** from Redis.
This second step is crucial. Without it, the `ProxyManager`'s "Sliding Window" check would see the old failures, immediately re-ban the "active" proxy on its next use, and lead to a `NO_ACTIVE_PROXIES` error. Clearing the history ensures that an unbanned proxy gets a truly fresh start.
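A minimal sketch of the two-step unban, using the Redis key patterns documented at the end of this section (the function name and JSON layout are illustrative):
```python
import json

def unban_proxy(redis_client, server_identity, proxy_url):
    hash_key = f"proxies:{server_identity}"
    state = json.loads(redis_client.hget(hash_key, proxy_url) or "{}")
    state["status"] = "ACTIVE"
    redis_client.hset(hash_key, proxy_url, json.dumps(state))  # step 1: re-activate
    redis_client.delete(f"proxy_failures:{proxy_url}")         # step 2: wipe the failure history
```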
### What Happens When All Accounts Are Banned or Resting?
If the entire pool of accounts becomes unavailable (either `BANNED` or `RESTING`), the system will effectively pause by default.
- The `ytdlp_ops_worker_per_url` DAG will fail at the `assign_account` step with an `AirflowException` because the active account pool will be empty.
- This will stop the processing loops. The system will remain paused until accounts are either manually unbanned or their ban/rest timers expire, at which point you can restart the processing loops using the `ytdlp_ops_orchestrator` DAG.
- The DAG graph for `ytdlp_ops_worker_per_url` now explicitly shows tasks for `assign_account`, `get_token`, `ban_account`, `retry_get_token`, etc., making the process flow and failure points much clearer.
However, the system can be configured to create new accounts automatically so that processing does not halt completely.
#### Automatic Account Creation on Exhaustion
- **Goal**: Ensure the processing pipeline continues to run even if all accounts in the primary pool are temporarily banned or resting.
- **How it works**: If the `auto_create_new_accounts_on_exhaustion` parameter is set to `True` and the account pool is defined using a prefix (not an explicit list), the system will generate a new, unique account ID when it finds the active pool empty.
- **New Account Naming**: New accounts are created with the format `{prefix}-auto-{unique_id}`.
- **Configuration**:
- **Parameter**: `auto_create_new_accounts_on_exhaustion`
- **Where to set**: In the `ytdlp_ops_orchestrator` DAG configuration when triggering a run.
- **Default**: `True`.
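A minimal sketch of the naming scheme (the source of the unique suffix is an assumption; only the `{prefix}-auto-{unique_id}` format is documented):
```python
import uuid

def make_auto_account_id(prefix: str) -> str:
    unique_id = uuid.uuid4().hex[:8]        # any collision-resistant suffix would do
    return f"{prefix}-auto-{unique_id}"

# e.g. make_auto_account_id("ytacct") -> "ytacct-auto-3f9c1a2b"
```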
---
## 6. Failure Handling and Retry Policy
**Goal:** To provide flexible control over how the system behaves when a worker encounters a "bannable" error (e.g., `BOT_DETECTED`).
### How It Works
When a worker's `get_token` task fails with a bannable error, the system's behavior is determined by the `on_bannable_failure` policy, which can be configured when starting the `ytdlp_ops_orchestrator`.
### Configuration
- **Parameter**: `on_bannable_failure`
- **Where to set**: In the `ytdlp_ops_orchestrator` DAG configuration.
- **Options**:
- `stop_loop` (Strictest):
- The account used is banned.
- The URL is marked as failed in the `_fail` Redis hash.
- The worker's processing loop is **stopped**. The lane becomes inactive.
- `retry_with_new_account` (Default, Most Resilient):
- The failing account is banned.
- The worker immediately retries the **same URL** with a new, unused account from the pool.
- If the retry succeeds, the worker continues its loop to the next URL.
- If the retry also fails, the second account **and the proxy** are also banned, and the worker's loop is stopped.
- `retry_and_ban_account_only`:
- Similar to `retry_with_new_account`, but on the second failure, it bans **only the second account**, not the proxy.
- This is useful when you trust your proxies but want to aggressively cycle through failing accounts.
- `retry_without_ban` (Most Lenient):
- The worker retries with a new account, but **no accounts or proxies are ever banned**.
- This policy is useful for debugging or when you are confident that failures are transient and not the fault of the resources.
This policy allows the system to be resilient to single account failures without losing the URL, while providing granular control over when to ban accounts and/or proxies if the problem persists.
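A minimal sketch of how the first-failure branching could look, assuming hypothetical callables for the individual actions (the real decision lives in the worker DAG's branching task):
```python
def on_first_bannable_failure(policy, ban_account, mark_url_failed, stop_lane, retry_with_new_account):
    if policy == "stop_loop":
        ban_account()                 # ban the account that failed
        mark_url_failed()             # record the URL in the _fail hash
        stop_lane()                   # this worker's loop ends here
    elif policy in ("retry_with_new_account", "retry_and_ban_account_only"):
        ban_account()                 # the first account is banned in both variants
        retry_with_new_account()      # same URL, fresh account
    elif policy == "retry_without_ban":
        retry_with_new_account()      # retry, but never ban accounts or proxies
```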
---
## 7. Worker DAG Logic (`ytdlp_ops_worker_per_url`)
This DAG is the "workhorse" of the system. It is designed as a self-sustaining loop to process one URL per run. The logic for handling failures and retries is now explicitly visible in the DAG's task graph.
### Tasks and Their Purpose:
- **`pull_url_from_redis`**: Fetches one URL from the Redis `_inbox` queue. If the queue is empty, the DAG run is skipped, stopping this worker's processing "lane".
- **`assign_account`**: Selects an account for the job. It maintains **account affinity** by re-using the same account from the previous successful run in its "lane". If it's the first run or the previous run failed, it picks a random active account.
- **`get_token`**: The primary attempt to get tokens and `info.json` by calling the `ytdlp-ops-server`.
- **`handle_bannable_error_branch`**: A branching task that runs if `get_token` fails. It inspects the error and decides the next step based on the `on_bannable_failure` policy.
- **`ban_account_and_prepare_for_retry`**: If a retry is permitted, this task bans the failed account and selects a new one.
- **`retry_get_token`**: A second attempt to get the token using the new account.
- **`ban_second_account_and_proxy`**: If the retry also fails, this task bans the second account and the proxy that was used.
- **`download_and_probe`**: If `get_token` or `retry_get_token` succeeds, this task uses `yt-dlp` to download the media and `ffmpeg` to verify that the downloaded file is a valid media file.
- **`mark_url_as_success`**: If `download_and_probe` succeeds, this task records the successful result in the Redis `_result` hash.
- **`handle_generic_failure`**: If any task fails non-recoverably, this task records the detailed error information in the Redis `_fail` hash.
- **`decide_what_to_do_next`**: A final branching task that decides whether to continue the loop (`trigger_self_run`), stop it gracefully (`stop_loop`), or mark it as failed (`fail_loop`).
- **`trigger_self_run`**: The task that actually triggers the next DAG run, creating the continuous loop.
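For orientation, a heavily simplified TaskFlow skeleton of such a self-triggering loop is shown below; the real DAG additionally wires in the branching, ban, and retry tasks listed above, and all task bodies here are placeholders:
```python
from airflow.decorators import dag, task
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
import pendulum


@dag(schedule=None, start_date=pendulum.datetime(2024, 1, 1), catchup=False)
def worker_loop_sketch():
    @task
    def pull_url_from_redis():
        ...  # pop one URL from the _inbox queue; skip the run if the queue is empty

    @task
    def assign_account(url):
        ...  # re-use the previous account after a success, otherwise pick a random active one

    @task
    def get_token(url, account):
        ...  # call the ytdlp-ops-server for tokens and info.json

    @task
    def download_and_probe(token_data):
        ...  # yt-dlp download + ffmpeg probe

    trigger_self = TriggerDagRunOperator(
        task_id="trigger_self_run",
        trigger_dag_id="worker_loop_sketch",  # each run schedules the next, forming the loop
    )

    url = pull_url_from_redis()
    account = assign_account(url)
    token = get_token(url, account)
    download_and_probe(token) >> trigger_self


worker_loop_sketch()
```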
---
## 8. Proxy State Lifecycle in Redis
This section details how a proxy's state (e.g., `ACTIVE`, `BANNED`) is managed and persisted in Redis. The system uses a "lazy initialization" pattern, meaning a proxy's state is only written to Redis when it is first needed.
### Step 1: Configuration and In-Memory Initialization
The server first learns about the list of available proxies from its startup configuration, not from Redis.
1. **Source of Truth**: Proxies are defined in the `.env` file (e.g., `CAMOUFOX_PROXIES`, `SOCKS5_SOCK_SERVER_IP`).
2. **Injection**: The `airflow/generate_envoy_config.py` script aggregates these into a single list, which is passed to the `ytdlp-ops-server` via the `--proxies` command-line argument during Docker Compose startup.
3. **In-Memory State**: The `ProxyManager` in `server_fix/proxy_manager.py` receives this list and holds it in memory. At this point, Redis is not involved.
### Step 2: First Write to Redis (Lazy Initialization)
A proxy's state is only persisted to Redis the first time it is actively managed or queried.
* **Trigger**: This typically happens on the first API call that requires proxy state, such as `getProxyStatus`.
* **Action**: The `ProxyManager` checks Redis for a hash with the key `proxies:<server_identity>` (e.g., `proxies:ytdlp-ops-airflow-service`).
* **Initialization**: If the key does not exist, the `ProxyManager` iterates through its in-memory list of proxies and writes each one to the Redis hash with a default state of `ACTIVE`.
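A minimal sketch of this lazy initialization, assuming illustrative field names inside the per-proxy JSON value:
```python
import json

def ensure_proxy_hash(redis_client, server_identity, proxy_urls):
    key = f"proxies:{server_identity}"        # e.g. proxies:ytdlp-ops-airflow-service
    if not redis_client.exists(key):          # first time this state is needed
        default = {"status": "ACTIVE", "success_count": 0, "failure_count": 0}
        redis_client.hset(key, mapping={url: json.dumps(default) for url in proxy_urls})
    return key
```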
### Step 3: Runtime Updates (Success and Failure)
The proxy's state in Redis is updated in real-time based on the outcome of token generation tasks.
* **On Success**: When a task using a proxy succeeds, `ProxyManager.report_success()` is called. This updates the proxy's `success_count` and `last_success_timestamp` in the Redis hash.
* **On Failure**: When a task fails, `ProxyManager.report_failure()` is called.
1. A record of the failure (including the account ID and job ID) is added to a separate Redis sorted set with the key `proxy_failures:<proxy_url>`. This key has a TTL and is used for the sliding window ban strategy.
2. The proxy's `failure_count` and `last_failure_timestamp` are updated in the main Redis hash.
* **Automatic Ban**: If the conditions for the "Sliding Window" ban are met (too many failures from different accounts in a short time), `ProxyManager.ban_proxy()` is called, which updates the proxy's `status` to `BANNED` in the Redis hash.
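A minimal sketch of the failure path, using the key patterns summarized at the end of this section; the window size, threshold, TTL, and member format of the sorted set are assumptions:
```python
import json
import time

def report_failure(redis_client, server_identity, proxy_url, account_id, job_id,
                   window_s=600, max_accounts=3, ttl_s=3600):
    now = time.time()
    failures_key = f"proxy_failures:{proxy_url}"
    # 1. Log the failure in the per-proxy sorted set (score = timestamp).
    redis_client.zadd(failures_key, {f"{account_id}:{job_id}:{now}": now})
    redis_client.expire(failures_key, ttl_s)
    # 2. Update the counters in the main proxy hash.
    hash_key = f"proxies:{server_identity}"
    state = json.loads(redis_client.hget(hash_key, proxy_url) or "{}")
    state["failure_count"] = state.get("failure_count", 0) + 1
    state["last_failure_timestamp"] = now
    # 3. Sliding-window check: distinct accounts failing within the window.
    recent = redis_client.zrangebyscore(failures_key, now - window_s, now)
    accounts = {(m.decode() if isinstance(m, bytes) else m).split(":")[0] for m in recent}
    if len(accounts) >= max_accounts:
        state["status"] = "BANNED"   # corresponds to ban_proxy() in the real ProxyManager
    redis_client.hset(hash_key, proxy_url, json.dumps(state))
```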
### Step 4: Observation and Manual Control
You can view and modify the proxy states stored in Redis using the provided management tools.
* **Observation**:
* **Airflow DAG**: The `ytdlp_mgmt_proxy_account` DAG (`action: list_statuses`, `entity: proxy`).
* **CLI Client**: The `proxy_manager_client.py` script (`list` command).
* These tools call the `getProxyStatus` API endpoint, which reads directly from the `proxies:<server_identity>` hash in Redis.
* **Manual Control**:
* The same tools provide `ban`, `unban`, and `reset` actions.
* These actions call API endpoints that directly modify the `status` field for a proxy in the `proxies:<server_identity>` Redis hash.
* The `delete_from_redis` action in the DAG provides a way to completely remove a proxy's state and failure history from Redis, forcing it to be re-initialized as `ACTIVE` on its next use.
### Summary of Redis Keys
| Redis Key Pattern | Type | Purpose |
| :--- | :--- | :--- |
| `proxies:<server_identity>` | Hash | The primary store for proxy state. Maps `proxy_url` to a JSON string containing its status (`ACTIVE`/`BANNED`), success/failure counts, and timestamps. |
| `proxy_failures:<proxy_url>` | Sorted Set | A temporary log of recent failures for a specific proxy, used by the sliding window ban logic. The score is the timestamp of the failure. |
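For ad-hoc inspection, these keys can also be read directly with `redis-py` (connection details below are placeholders for your own Redis instance):
```python
import json
import redis

r = redis.Redis(host="localhost", port=6379, password="...", decode_responses=True)
for proxy_url, raw in r.hgetall("proxies:ytdlp-ops-airflow-service").items():
    state = json.loads(raw)
    print(proxy_url, state.get("status"), state.get("failure_count"))
```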

View File

@ -0,0 +1,97 @@
# YTDLP Client Side Integration
This document describes how to integrate and use the YTDLP client with the token service.
## Build
1. **Pull, configure and start server if needed:**
```bash
cd /srv/airflow_worker/
docker login pangramia  # Should already have been done beforehand; otherwise, ask for the pull password
docker compose -f docker-compose-ytdlp-ops.yaml up -d
docker compose -f docker-compose-ytdlp-ops.yaml logs -f
```
The server is bound to a specific proxy, e.g. "socks5://sslocal-rust-1084:1084".
Also check that Redis is bound to 0.0.0.0 in its config.
2. **Build airflow-worker with custom dependencies:**
```bash
cd /srv/airflow_worker/
docker compose build airflow-worker
docker compose down airflow-worker
docker compose up -d --no-deps airflow-worker
```
3. **Test the built-in client:**
```bash
# Show client help
docker compose exec airflow-worker python /app/ytdlp_ops_client.py --help
# Get token and info.json
docker compose exec airflow-worker python /app/ytdlp_ops_client.py --host 16.162.82.212 --port 9080 getToken --url 'https://www.youtube.com/watch?v=vKTVLpmvznI'
# List formats using saved info.json
docker compose exec airflow-worker yt-dlp --load-info-json "latest.json" -F
# Simulate download using saved info.json
docker compose exec airflow-worker yt-dlp --load-info-json "latest.json" --proxy "socks5://89.253.221.173:1084" --simulate --verbose
# Extract metadata and download URLs using jq
docker compose exec airflow-worker jq -r '"Title: \(.title)", "Date: \(.upload_date | strptime("%Y%m%d") | strftime("%Y-%m-%d"))", "Author: \(.uploader)", "Length: \(.duration_string)", "", "Download URLs:", (.formats[] | select(.vcodec != "none" or .acodec != "none") | .url)' latest.json
```
4. **Test Airflow task:**
To run the `ytdlp_client_dag_v2.1` DAG:
Set up the required Airflow variables and connections:
```bash
docker compose exec airflow-worker airflow variables set DOWNLOAD_OPTIONS '{"formats": ["bestvideo[height<=1080]+bestaudio/best[height<=1080]"]}'
docker compose exec airflow-worker airflow variables set DOWNLOADS_TEMP '/opt/airflow/downloadfiles'
docker compose exec airflow-worker airflow variables set DOWNLOADS_PATH '/opt/airflow/downloadfiles'
docker compose exec airflow-worker airflow variables list
docker compose exec airflow-worker airflow variables set TOKEN_TIMEOUT '300'
docker compose exec airflow-worker airflow connections import /opt/airflow/config/docker_hub_repo.json
docker compose exec airflow-worker airflow connections delete redis_default
docker compose exec airflow-worker airflow connections import /opt/airflow/config/redis_default_conn.json
```
**Run the task directly with `airflow tasks test`, using a direct connection to the service:**
```bash
docker compose exec airflow-worker airflow db reset
docker compose exec airflow-worker airflow dags reserialize
docker compose exec airflow-worker airflow dags list
docker compose exec airflow-worker airflow dags list-import-errors
docker compose exec airflow-worker airflow tasks test ytdlp_client_dag_v2.1 get_token $(date -u +"%Y-%m-%dT%H:%M:%S+00:00") --task-params '{"url": "https://www.youtube.com/watch?v=sOlTX9uxUtM", "redis_enabled": false, "service_ip": "16.162.82.212", "service_port": 9080}'
docker compose exec airflow-worker yt-dlp --load-info-json /opt/airflow/downloadfiles/latest.json --proxy "socks5://89.253.221.173:1084" --verbose --simulate
docker compose exec airflow-worker airflow dags list-runs -d ytdlp_client_dag
```
Alternatively, unpause and trigger the DAG:
```bash
docker compose exec airflow-worker airflow dags list
docker compose exec airflow-worker airflow dags unpause ytdlp_client_dag_v2.1
# Or use the Airflow UI, and re-check that it works from the server deployment
docker compose exec airflow-worker airflow dags trigger ytdlp_client_dag_v2.1 -c '{"url": "https://www.youtube.com/watch?v=sOlTX9uxUtM", "redis_enabled": false, "service_ip": "16.162.82.212", "service_port": 9080}'
```
Check Redis for stored data by video ID:
```bash
docker compose exec redis redis-cli -a XXXXXX -h 89.253.221.173 -p 52909 HGETALL "token_info:sOlTX9uxUtM" | jq -R -s 'split("\n") | del(.[] | select(. == "")) | [.[range(0;length;2)]]'
```

93
airflow/README.md Normal file
View File

@ -0,0 +1,93 @@
# Airflow DAGs Explanation
## ytdlp_ops_worker_per_url.py
This DAG processes a single YouTube URL passed via DAG run configuration. It's the "Worker" part of a Sensor/Worker pattern and uses the TaskFlow API to implement worker affinity, ensuring all tasks for a single URL run on the same machine.
### DAG Structure and Flow
**Legend:**
* `TaskName`: An Airflow task.
* `-->`: Successful execution flow.
* `--(fail)-->`: Execution flow triggered by the failure of the preceding task.
* `--(success)-->`: Execution flow triggered only if the preceding task succeeds.
* `[Group: GroupName]`: A TaskGroup containing sub-tasks.
**Execution Flow:**
1. **Start:** The DAG run is triggered (e.g., by the dispatcher).
2. **`get_url_and_assign_account`**
* Purpose: Gets the URL and assigns the first account.
* Flow:
* `--> get_token` (Success path)
* `--(fail)--> handle_bannable_error_branch` (Failure path)
3. **`get_token`** (Initial attempt)
* Purpose: Calls the Thrift service to get a token using the assigned account.
* Flow:
* `--(success)--> download_and_probe` (Success path, passed via `coalesce_token_data`)
* `--(fail)--> handle_bannable_error_branch` (Failure path)
4. **`handle_bannable_error_branch`**
* Purpose: Checks the error from `get_token` and decides the next step based on error type and policy.
* Flow (Branches):
* If bannable error & retry policy:
* `--> [Group: ban_account_and_prepare_for_retry]`
* `--> check_sliding_window_for_ban`
* `--> ban_account_task` (if ban criteria met)
* `--> skip_ban_task` (if ban criteria not met)
* `--> assign_new_account_for_retry` (after group)
* `--> retry_get_token` (using new account)
* If bannable error & stop policy:
* `--> ban_and_fail` (Bans account and fails DAG)
* If connection error & retry policy:
* `--> assign_new_account_for_retry`
* `--> retry_get_token`
* If non-bannable/connection error:
* (No specific path is defined; the DAG will likely fail)
5. **`retry_get_token`**
* Purpose: Calls the Thrift service again using the new account.
* Flow:
* `--(success)--> download_and_probe` (Success path, passed via `coalesce_token_data`)
* `--(fail)--> handle_generic_failure` (Failure path)
6. **`coalesce_token_data`**
* Purpose: Selects the successful token data from either the initial or retry attempt.
* Flow:
* `--> download_and_probe` (Success path)
7. **`download_and_probe`**
* Purpose: Uses the token data to download the media file and probes it with ffmpeg.
* Flow:
* `--(success)--> mark_url_as_success` (Success path)
* `--(fail)--> handle_generic_failure` (Failure path)
8. **`mark_url_as_success`**
* Purpose: Records the successful processing result.
* Flow:
* `--(success)--> continue_processing_loop` (Success path)
* `--(fail)--> handle_generic_failure` (Failure path)
9. **`continue_processing_loop`**
* Purpose: Triggers a new run of the dispatcher DAG.
* Flow:
* (End of this DAG run)
10. **`handle_generic_failure`**
* Purpose: Catches any unhandled failures and marks the DAG run as failed.
* Flow:
* (End of this DAG run, marked as failed)
### Purpose of Orchestrator and Dispatcher
The system uses separate orchestrator and dispatcher components for several key reasons:
1. **Worker Affinity/Pinning:** One of the main reasons is to ensure that all tasks related to processing a single URL run on the same worker machine. This is crucial because the `get_token` task generates an `info.json` file that contains session-specific data (like cookies and tokens). The subsequent `download_and_probe` task needs to use this exact `info.json` file. By using a dedicated worker DAG (`ytdlp_ops_worker_per_url.py`) with worker affinity, we guarantee that the file system where `info.json` is stored is accessible to both tasks.
2. **Scalability and Load Distribution:** The dispatcher can monitor queues or sources of URLs and trigger individual worker DAG runs. This decouples the discovery of work from the execution of work, allowing for better scaling and management of processing load across multiple workers.
3. **Fault Isolation:** If processing a single URL fails, it only affects that specific worker DAG run, not the entire pipeline. The dispatcher can continue to trigger other worker runs for other URLs.
4. **Flexibility:** The orchestrator/dispatcher pattern allows for more complex scheduling, prioritization, and routing logic to be implemented in the dispatcher, while keeping the worker DAG focused on the core processing steps for a single unit of work.
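As a rough illustration of point 2, a dispatcher could trigger one worker run per URL roughly like this (the `conf` keys and the queue-pinning detail are assumptions; the project's actual affinity mechanism may differ):
```python
from airflow.operators.trigger_dagrun import TriggerDagRunOperator

def make_dispatch_task(url: str, target_host: str) -> TriggerDagRunOperator:
    return TriggerDagRunOperator(
        task_id=f"dispatch_{target_host}",
        trigger_dag_id="ytdlp_ops_worker_per_url",
        conf={
            "url": url,                                 # the single unit of work for this run
            "worker_queue": f"queue-dl-{target_host}",  # hypothetical key: pins tasks to one host's queue
        },
    )
```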

3241
airflow/airflow.cfg Normal file

File diff suppressed because it is too large

1253
airflow/answer.json Normal file

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,144 @@
# Airflow remote DL worker configuration.
# This file should be used on a remote machine to run a download worker.
# It requires a master Airflow instance running with services exposed.
#
# Before running, create a .env file in this directory with:
# MASTER_HOST_IP=... a.b.c.d ... # IP address of the machine running docker-compose-master.yaml
# POSTGRES_PASSWORD=... # The password for the PostgreSQL database from the master compose file
# REDIS_PASSWORD=... # The password for Redis from the master compose file
# AIRFLOW_UID=... # User ID for file permissions, should match master
---
x-airflow-common:
&airflow-common
# This should point to the same image used by the master.
# If you built a custom image for master, you need to push it to a registry
# and reference it here.
image: ${AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest}
build: .
# Add extra hosts here to allow workers to resolve other hosts by name.
# This section is auto-generated from cluster.yml
extra_hosts:
{% for host_name, host_ip in all_hosts.items() %}
- "{{ host_name }}:{{ host_ip }}"
{% endfor %}
env_file:
- .env
environment:
&airflow-common-env
# Airflow Core
AIRFLOW__CORE__EXECUTOR: CeleryExecutor
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
AIRFLOW__CORE__FERNET_KEY: '' # Should be same as master, but worker does not need it.
# Backend connections - These should point to the master node
# Set MASTER_HOST_IP, POSTGRES_PASSWORD, and REDIS_PASSWORD in your .env file
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}@${MASTER_HOST_IP}:5432/airflow
AIRFLOW__CELERY__BROKER_URL: redis://:${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT}@${MASTER_HOST_IP}:52909/0
AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}@${MASTER_HOST_IP}:5432/airflow
# Remote Logging - connection is fetched from DB, which is on master
AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
AIRFLOW__CORE__INTERNAL_API_SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
AIRFLOW__CORE__LOCAL_SETTINGS_PATH: "/opt/airflow/config/custom_task_hooks.py"
volumes:
# Mount dags to get any utility scripts, but the worker will pull the DAG from the DB
- ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
# Mount logs locally in case remote logging fails
- ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
# Mount config for local settings and other configurations
- ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
# Mount download directories
- ${AIRFLOW_PROJ_DIR:-.}/downloadfiles:/opt/airflow/downloadfiles
- ${AIRFLOW_PROJ_DIR:-.}/addfiles:/opt/airflow/addfiles
- ${AIRFLOW_PROJ_DIR:-.}/inputfiles:/opt/airflow/inputfiles
# Use AIRFLOW_UID and AIRFLOW_GID from .env file to fix permission issues.
user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-0}"
services:
airflow-worker:
<<: *airflow-common
container_name: airflow-dl-worker-1
hostname: ${HOSTNAME:-dl001}
# The worker now listens on the generic queue AND its own dedicated queue.
# The hostname is dynamically inserted into the queue name.
command: airflow celery worker -q queue-dl,queue-dl-${HOSTNAME:-dl001}
deploy:
resources:
limits:
# Increased from 4G to 8G to support higher memory per child process.
memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT:-8G}
reservations:
memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_RESERV:-2G}
healthcheck:
test:
- "CMD-SHELL"
- 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-dl@$$(hostname)"'
interval: 30s
timeout: 30s
retries: 5
start_period: 30s
environment:
<<: *airflow-common-env
HOSTNAME: ${HOSTNAME:-dl001} # Explicitly set inside container
DUMB_INIT_SETSID: "0"
AIRFLOW__CELERY__WORKER_QUEUES: "queue-dl,queue-dl-${HOSTNAME:-dl001}"
AIRFLOW__CELERY__WORKER_TAGS: "dl"
AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
AIRFLOW__CELERY__WORKER_CONCURRENCY: ${AIRFLOW_WORKER_DOWNLOAD_CONCURRENCY:-16}
AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
AIRFLOW__CELERY__WORKER_NAME: "worker-dl@%h"
AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
# Increased from 256MB to 512MB for memory-intensive yt-dlp tasks.
# This value is in KB. 512 * 1024 = 524288.
AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "524288" # 512MB
# The hostname is now managed by Docker Compose to ensure uniqueness when scaling.
# It will be generated based on project, service, and replica number (e.g., airflow-airflow-dl-worker-1).
# hostname: "dl-worker-${HOSTNAME_SUFFIX:-$$(hostname)}"
ports:
- "8793:8793"
networks:
- default
- proxynet
restart: always
airflow-triggerer:
<<: *airflow-common
container_name: airflow-dl-triggerer-1
hostname: ${HOSTNAME}
command: triggerer
healthcheck:
test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
interval: 30s
timeout: 30s
retries: 5
start_period: 60s
environment:
<<: *airflow-common-env
PYTHONASYNCIODEBUG: 1
DUMB_INIT_SETSID: 0
restart: always
docker-socket-proxy:
profiles:
- disabled
image: tecnativa/docker-socket-proxy:0.1.1
environment:
CONTAINERS: 1
IMAGES: 1
AUTH: 1
POST: 1
privileged: true
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
restart: always
networks:
proxynet:
name: airflow_proxynet
external: true

View File

@ -0,0 +1,534 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
#
# WARNING: This configuration is for local development. Do not use it in a production deployment.
#
# This configuration supports basic configuration using environment variables or an .env file
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
# Default: apache/airflow:2.10.5
# AIRFLOW_UID - User ID in Airflow containers
# Default: 50000
# AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed.
# Default: .
# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
#
# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested).
# Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested).
# Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
# Use this option ONLY for quick checks. Installing requirements at container
# startup is done EVERY TIME the service is started.
# A better way is to build a custom image or extend the official image
# as described in https://airflow.apache.org/docs/docker-stack/build.html.
# Default: ''
#
# Feel free to modify this file to suit your needs.
---
name: airflow-master
x-minio-common: &minio-common
image: quay.io/minio/minio:RELEASE.2025-07-23T15-54-02Z
command: server --console-address ":9001" http://minio{1...3}/data{1...2}
expose:
- "9000"
- "9001"
networks:
- proxynet
env_file:
- .env
environment:
MINIO_ROOT_USER: ${MINIO_ROOT_USER:-admin}
MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-0153093693-0009}
healthcheck:
test: ["CMD", "mc", "ready", "local"]
interval: 5s
timeout: 5s
retries: 5
restart: always
x-airflow-common:
&airflow-common
# In order to add custom dependencies or upgrade provider packages you can use your extended image.
# This will build the image from the Dockerfile in this directory and tag it.
image: ${AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest}
build: .
# Add extra hosts here to allow the master services (webserver, scheduler) to resolve
# the hostnames of your remote DL workers. This is crucial for fetching logs.
# Format: - "hostname:ip_address"
# IMPORTANT: This section is auto-generated from cluster.yml
extra_hosts:
- "af-test:89.253.223.97"
- "dl001:109.107.189.106"
env_file:
- .env
networks:
- proxynet
environment:
&airflow-common-env
AIRFLOW__CORE__PARALLELISM: 64
AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG: 32
AIRFLOW__SCHEDULER__PARSING_PROCESSES: 4
AIRFLOW__CORE__EXECUTOR: CeleryExecutor
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}@postgres/airflow
AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}@postgres/airflow
AIRFLOW__CELERY__BROKER_URL: redis://:${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT}@redis:6379/0
AIRFLOW__CORE__FERNET_KEY: ''
AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
AIRFLOW__CORE__INTERNAL_API_SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
# yamllint disable rule:line-length
# Use simple http server on scheduler for health checks
# See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
# yamllint enable rule:line-length
AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
# WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks
# for other purpose (development, test and especially production usage) build/extend Airflow image.
#_PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- apache-airflow-providers-docker apache-airflow-providers-http thrift>=0.16.0,<=0.20.0 backoff>=2.2.1 python-dotenv==1.0.1 psutil>=5.9.0} # The following line can be used to set a custom config file, stored in the local config folder
# If you want to use it, outcomment it and replace airflow.cfg with the name of your config file
AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
AIRFLOW__LOGGING__REMOTE_LOG_FORMAT: "[%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s"
AIRFLOW__LOGGING__LOG_LEVEL: "INFO"
AIRFLOW__LOGGING__LOG_FILENAME_TEMPLATE: "{{ ti.dag_id }}/{{ ti.run_id }}/{{ ti.task_id }}/attempt={{ try_number }}.log"
AIRFLOW__CORE__LOCAL_SETTINGS_PATH: "/opt/airflow/config/custom_task_hooks.py"
volumes:
- ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
- ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
- ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
- ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
- ${AIRFLOW_PROJ_DIR:-.}/downloadfiles:/opt/airflow/downloadfiles
- ${AIRFLOW_PROJ_DIR:-.}/addfiles:/opt/airflow/addfiles
- ${AIRFLOW_PROJ_DIR:-.}/inputfiles:/opt/airflow/inputfiles
user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-0}"
depends_on:
&airflow-common-depends-on
redis:
condition: service_healthy
postgres:
condition: service_healthy
nginx-minio-lb:
condition: service_healthy
services:
postgres:
image: postgres:13
env_file:
- .env
networks:
- proxynet
environment:
POSTGRES_USER: airflow
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}
POSTGRES_DB: airflow
volumes:
- postgres-db-volume:/var/lib/postgresql/data
ports:
- "5432:5432"
healthcheck:
test: ["CMD", "pg_isready", "-U", "airflow"]
interval: 10s
retries: 5
start_period: 5s
restart: always
redis:
# Redis is limited to 7.2-bookworm due to licencing change
# https://redis.io/blog/redis-adopts-dual-source-available-licensing/
image: redis:7.2-bookworm
env_file:
- .env
networks:
- proxynet
command: sh -c "redis-server --requirepass ${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} --bind 0.0.0.0 --save 60 1 --loglevel warning --appendonly yes"
volumes:
- ./redis-data:/data
expose:
- 6379
ports:
- "52909:6379"
healthcheck:
test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT}", "ping"]
interval: 10s
timeout: 30s
retries: 50
start_period: 30s
restart: always
redis-proxy-account-clear:
image: redis:7.2-bookworm
container_name: redis-proxy-account-clear
env_file:
- .env
networks:
- proxynet
command: >
sh -c "
echo 'Clearing proxy and account statuses from Redis...';
redis-cli -h redis -a $${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} --scan --pattern 'proxy_status:*' | xargs -r redis-cli -h redis -a $${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} DEL;
redis-cli -h redis -a $${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} --scan --pattern 'account_status:*' | xargs -r redis-cli -h redis -a $${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} DEL;
echo 'Redis cleanup complete.'
"
depends_on:
redis:
condition: service_healthy
minio1:
<<: *minio-common
hostname: minio1
volumes:
- ./minio-data/1/1:/data1
- ./minio-data/1/2:/data2
minio2:
<<: *minio-common
hostname: minio2
volumes:
- ./minio-data/2/1:/data1
- ./minio-data/2/2:/data2
depends_on:
minio1:
condition: service_started
minio3:
<<: *minio-common
hostname: minio3
volumes:
- ./minio-data/3/1:/data1
- ./minio-data/3/2:/data2
depends_on:
minio2:
condition: service_started
nginx-minio-lb:
image: nginx:1.19.2-alpine
hostname: nginx-minio-lb
networks:
- proxynet
command: sh -c "apk add --no-cache curl >/dev/null 2>&1 && exec nginx -g 'daemon off;'"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
ports:
- "9000:9000"
- "9001:9001"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9001/minio/health/live"]
interval: 10s
timeout: 5s
retries: 5
start_period: 10s
depends_on:
minio1:
condition: service_healthy
minio2:
condition: service_healthy
minio3:
condition: service_healthy
restart: always
minio-init:
image: minio/mc
container_name: minio-init
networks:
- proxynet
depends_on:
nginx-minio-lb:
condition: service_healthy
entrypoint: >
/bin/sh -c "
set -e;
/usr/bin/mc alias set minio http://nginx-minio-lb:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD;
# Retry loop for bucket creation
MAX_ATTEMPTS=10
SUCCESS=false
# Use a for loop for robustness, as it's generally more portable than `until`.
for i in $$(seq 1 $$MAX_ATTEMPTS); do
# Check if the bucket exists. If so, we're done.
if /usr/bin/mc ls minio/airflow-logs > /dev/null 2>&1; then
echo 'MinIO bucket already exists.'
SUCCESS=true
break
fi
# If not, try to create it. If successful, we're done.
# We redirect output because `mc mb` can error if another process creates it in the meantime.
if /usr/bin/mc mb minio/airflow-logs > /dev/null 2>&1; then
echo 'MinIO bucket created.'
SUCCESS=true
break
fi
# If we reach here, both checks failed. Wait and retry.
echo "Attempt $$i/$$MAX_ATTEMPTS: Waiting for MinIO bucket..."
sleep 2
done
# After the loop, check if we succeeded.
if [ "$$SUCCESS" = "false" ]; then
echo "Failed to create MinIO bucket after $$MAX_ATTEMPTS attempts."
exit 1
fi
/usr/bin/mc anonymous set download minio/airflow-logs;
echo 'MinIO initialized: bucket airflow-logs created and policy set to download.';
"
env_file:
- .env
environment:
MINIO_ROOT_USER: ${MINIO_ROOT_USER:-admin}
MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-0153093693-0009}
restart: on-failure
nginx-healthcheck:
image: nginx:alpine
container_name: nginx-healthcheck
networks:
- proxynet
ports:
- "8888:80"
restart: always
airflow-webserver:
<<: *airflow-common
command: webserver
ports:
- "8080:8080"
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-scheduler:
<<: *airflow-common
command: scheduler
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-master-worker:
<<: *airflow-common
command: airflow celery worker -q main,default
healthcheck:
# yamllint disable rule:line-length
test:
- "CMD-SHELL"
- 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-master@$$(hostname)"'
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
environment:
<<: *airflow-common-env
# Required to handle warm shutdown of the celery workers properly
# See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
DUMB_INIT_SETSID: 0
AIRFLOW__CELERY__WORKER_QUEUES: "main,default"
AIRFLOW__CELERY__WORKER_TAGS: "master"
AIRFLOW__CELERY__WORKER_CONCURRENCY: "16"
AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
AIRFLOW__CELERY__WORKER_NAME: "worker-master@%h"
AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
# Max memory per child process before it's recycled. Helps prevent memory leaks.
# 256MB is sufficient for master worker tasks. DL workers use a higher limit.
AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "262144" # 256MB
hostname: ${HOSTNAME}
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-triggerer:
<<: *airflow-common
command: triggerer
healthcheck:
test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-init:
<<: *airflow-common
depends_on:
<<: *airflow-common-depends-on
minio-init:
condition: service_completed_successfully
redis-proxy-account-clear:
condition: service_completed_successfully
entrypoint: /bin/bash
# yamllint disable rule:line-length
command:
- -c
- |
# This container runs as root and is responsible for initializing the environment.
# It sets permissions on mounted directories to ensure the 'airflow' user (running with AIRFLOW_UID)
# can write to them. This is crucial for logs, dags, and plugins.
echo "Initializing permissions for Airflow directories..."
chown -R "${AIRFLOW_UID}:${AIRFLOW_GID}" /opt/airflow/dags /opt/airflow/logs /opt/airflow/plugins /opt/airflow/config /opt/airflow/downloadfiles /opt/airflow/addfiles /opt/airflow/inputfiles
echo "Permissions set."
if [[ -z "${AIRFLOW_UID}" ]]; then
echo
echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
echo "If you are on Linux, you SHOULD follow the instructions below to set "
echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
echo "For other operating systems you can get rid of the warning with manually created .env file:"
echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
echo
fi
# This container's job is to initialize the database, create a user, and import connections.
# Wait for db to be ready.
airflow db check --retry 30 --retry-delay 5
# Run database migrations.
echo "Running database migrations..."
airflow db upgrade
echo "Database migrations complete."
# Create the admin user if it doesn't exist.
# The '|| true' prevents the script from failing if the user already exists.
echo "Checking for and creating admin user..."
airflow users create \
--username "admin" \
--password "${AIRFLOW_ADMIN_PASSWORD:-admin_pwd_X9yZ3aB1cE5dF7gH}" \
--firstname Admin \
--lastname User \
--role Admin \
--email admin@example.com || true
echo "Admin user check/creation complete."
# Import connections from any .json file in the config directory.
echo "Searching for connection files in /opt/airflow/config..."
if [ -d "/opt/airflow/config" ] && [ -n "$(ls -A /opt/airflow/config/*.json 2>/dev/null)" ]; then
for conn_file in /opt/airflow/config/*.json; do
if [ -f "$$conn_file" ]; then
# Exclude files that are not meant to be Airflow connections.
if [ "$(basename "$$conn_file")" = "camoufox_endpoints.json" ]; then
echo "Skipping '$$conn_file' as it is not an Airflow connection file."
continue
fi
echo "Importing connections from $$conn_file"
airflow connections import "$$conn_file" || echo "Failed to import $$conn_file, but continuing."
fi
done
else
echo "No connection files found to import, or /opt/airflow/config is empty/missing."
fi
echo "Connection import process complete."
# yamllint enable rule:line-length
environment:
<<: *airflow-common-env
_AIRFLOW_DB_MIGRATE: 'true'
_AIRFLOW_WWW_USER_CREATE: 'false' # Set to false as we handle it manually
_PIP_ADDITIONAL_REQUIREMENTS: ''
user: "0:0"
airflow-cli:
<<: *airflow-common
profiles:
- debug
environment:
<<: *airflow-common-env
CONNECTION_CHECK_MAX_COUNT: "0"
# Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
command:
- bash
- -c
- airflow
# You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up
# or by explicitly targeted on the command line e.g. docker-compose up flower.
# See: https://docs.docker.com/compose/profiles/
flower:
<<: *airflow-common
command: celery flower
ports:
- "5555:5555"
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
docker-socket-proxy:
profiles:
- disabled
image: tecnativa/docker-socket-proxy:0.1.1
networks:
- proxynet
environment:
CONTAINERS: 1
IMAGES: 1
AUTH: 1
POST: 1
privileged: true
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
restart: always
volumes:
postgres-db-volume:
networks:
proxynet:
name: airflow_proxynet
external: true

View File

@ -0,0 +1,96 @@
name: ytdlp-ops
include:
# This automatically includes the generated camoufox service definitions and dependencies.
# It simplifies the docker-compose command, as you no longer need to specify both files with -f.
# The file is generated by the config-generator service and will be created even if empty.
- docker-compose.camoufox.yaml
services:
envoy:
image: envoyproxy/envoy:v1.29-latest
container_name: envoy-thrift-lb
restart: unless-stopped
volumes:
# Mount the generated config file from the host
- ./envoy.yaml:/etc/envoy/envoy.yaml:ro
ports:
# This is the single public port for all Thrift traffic
- "${ENVOY_PORT:-9080}:${ENVOY_PORT:-9080}"
# Expose the admin port for debugging
- "${ENVOY_ADMIN_PORT:-9901}:${ENVOY_ADMIN_PORT:-9901}"
networks:
- proxynet
# This service depends on ytdlp-ops-service, which in turn waits for camoufox.
depends_on:
- ytdlp-ops-service
ytdlp-ops-service:
image: pangramia/ytdlp-ops-server:latest # Don't comment out or remove, build is performed externally
# container_name is omitted; Docker will use the service name for DNS.
# This service depends on the 'camoufox-group' service, which is defined in the
# generated docker-compose.camoufox.yaml file. This ensures all camoufox
# instances are started before this service.
depends_on:
- camoufox-group
# Ports are no longer exposed directly. Envoy will connect to them on the internal network.
env_file:
- ./.env # Path is relative to the compose file
volumes:
- context-data:/app/context-data
# Mount the generated config directory to make endpoints available to the server
- ./config:/app/config:ro
# Mount the plugin source code for live updates without rebuilding the image.
# Assumes the plugin source is in a 'bgutil-ytdlp-pot-provider' directory
# next to your docker-compose.yaml file.
#- ./bgutil-ytdlp-pot-provider:/app/bgutil-ytdlp-pot-provider
networks:
- proxynet
command:
# --- Parameters for ALL service roles ---
- "--port"
- "${YTDLP_BASE_PORT:-9090}"
- "--timeout"
- "${YTDLP_TIMEOUT:-600}"
- "--workers"
- "${YTDLP_WORKERS:-3}"
- "--verbose"
- "--server-identity"
- "${SERVER_IDENTITY:-ytdlp-ops-airflow-service}"
- "--redis-host"
- "${REDIS_HOST:-redis}"
- "--redis-port"
- "${REDIS_PORT:-6379}"
- "--redis-password"
- "${REDIS_PASSWORD}"
- "--account-active-duration-min"
- "${ACCOUNT_ACTIVE_DURATION_MIN:-30}"
- "--account-cooldown-duration-min"
- "${ACCOUNT_COOLDOWN_DURATION_MIN:-60}"
- "--service-role"
- "all-in-one"
# --- Parameters for worker/all-in-one roles ONLY ---
- "--script-dir"
- "/app"
- "--context-dir"
- "/app/context-data"
- "--clean-context-dir"
- "--clients"
- "${YT_CLIENTS:-web,mweb,ios,android}"
- "--proxies"
- "socks5://172.17.0.1:1087"
- "--camoufox-endpoints-file"
- "/app/config/camoufox_endpoints.json"
- "--print-tokens"
- "--stop-if-no-proxy"
restart: unless-stopped
pull_policy: always
volumes:
context-data:
name: context-data
networks:
proxynet:
name: airflow_proxynet
external: true

407
airflow/bgutil-diff.txt Normal file
View File

@ -0,0 +1,407 @@
Diff to getpot_bgutil_http
def _validate_get_pot(self, client: str, ydl: YoutubeDL, visitor_data=None, data_sync_id=None, player_url=None, **kwargs):
if client != 'ios':
raise UnsupportedRequest(f'Client {client} is not supported')
base_url = ydl.get_info_extractor('Youtube')._configuration_arg(
'getpot_bgutil_baseurl', ['http://127.0.0.1:4416'], casesense=True)[0]
# Validate visitor data format for ios client
if visitor_data and not visitor_data.startswith('Cg'):
raise UnsupportedRequest('Invalid visitor data format for ios client')
if not data_sync_id and not visitor_data:
raise UnsupportedRequest(
'One of [data_sync_id, visitor_data] must be passed')
>>>>>>> 559b875 (feat: Add support for pre-provided ios PO tokens and client-specific validation)
try:
self.logger.trace(
f'Checking server availability at {self._base_url}/ping')
response = json.load(self._request_webpage(Request(
f'{self._base_url}/ping', extensions={'timeout': self._GET_SERVER_VSN_TIMEOUT}, proxies={'all': None}),
note=False))
except TransportError as e:
# the server may be down
script_path_provided = self.ie._configuration_arg(
ie_key='youtubepot-bgutilscript', key='script_path', default=[None])[0] is not None
warning_base = f'Error reaching GET {self._base_url}/ping (caused by {e.__class__.__name__}). '
if script_path_provided: # server down is expected, log info
self._info_and_raise(
warning_base + 'This is expected if you are using the script method.')
else:
self._warn_and_raise(
warning_base + f'Please make sure that the server is reachable at {self._base_url}.')
return
except HTTPError as e:
# may be an old server, don't raise
self.logger.warning(
f'HTTP Error reaching GET /ping (caused by {e!r})', once=True)
return
except json.JSONDecodeError as e:
# invalid server
self._warn_and_raise(
f'Error parsing ping response JSON (caused by {e!r})')
return
except Exception as e:
self._warn_and_raise(
f'Unknown error reaching GET /ping (caused by {e!r})', raise_from=e)
return
else:
self._check_version(response.get('version', ''), name='HTTP server')
self._server_available = True
return True
finally:
self._last_server_check = time.time()
<<<<<<< HEAD
def is_available(self):
return self._server_available or self._last_server_check + 60 < int(time.time())
def _real_request_pot(
self,
request: PoTokenRequest,
) -> PoTokenResponse:
if not self._check_server_availability(request):
raise PoTokenProviderRejectedRequest(
f'{self.PROVIDER_NAME} server is not available')
# used for CI check
self.logger.trace('Generating POT via HTTP server')
=======
def _validate_get_pot(self, client: str, ydl: YoutubeDL, visitor_data=None, data_sync_id=None, player_url=None, **kwargs):
if client != 'ios':
raise UnsupportedRequest(f'Client {client} is not supported')
base_url = ydl.get_info_extractor('Youtube')._configuration_arg(
'getpot_bgutil_baseurl', ['http://127.0.0.1:4416'], casesense=True)[0]
# Validate visitor data format for ios client
if visitor_data and not visitor_data.startswith('Cg'):
raise UnsupportedRequest('Invalid visitor data format for ios client')
if not data_sync_id and not visitor_data:
raise UnsupportedRequest(
'One of [data_sync_id, visitor_data] must be passed')
>>>>>>> 559b875 (feat: Add support for pre-provided ios PO tokens and client-specific validation)
try:
self.logger.trace(
f'Checking server availability at {self._base_url}/ping')
response = json.load(self._request_webpage(Request(
f'{self._base_url}/ping', extensions={'timeout': self._GET_SERVER_VSN_TIMEOUT}, proxies={'all': None}),
note=False))
except TransportError as e:
# the server may be down
script_path_provided = self.ie._configuration_arg(
ie_key='youtubepot-bgutilscript', key='script_path', default=[None])[0] is not None
warning_base = f'Error reaching GET {self._base_url}/ping (caused by {e.__class__.__name__}). '
if script_path_provided: # server down is expected, log info
self._info_and_raise(
warning_base + 'This is expected if you are using the script method.')
else:
self._warn_and_raise(
warning_base + f'Please make sure that the server is reachable at {self._base_url}.')
return
except HTTPError as e:
# may be an old server, don't raise
self.logger.warning(
f'HTTP Error reaching GET /ping (caused by {e!r})', once=True)
return
except json.JSONDecodeError as e:
# invalid server
self._warn_and_raise(
f'Error parsing ping response JSON (caused by {e!r})')
return
except Exception as e:
self._warn_and_raise(
f'Unknown error reaching GET /ping (caused by {e!r})', raise_from=e)
return
else:
self._check_version(response.get('version', ''), name='HTTP server')
self._server_available = True
return True
finally:
self._last_server_check = time.time()
<<<<<<< HEAD
def is_available(self):
return self._server_available or self._last_server_check + 60 < int(time.time())
def _real_request_pot(
self,
request: PoTokenRequest,
) -> PoTokenResponse:
if not self._check_server_availability(request):
raise PoTokenProviderRejectedRequest(
f'{self.PROVIDER_NAME} server is not available')
# used for CI check
self.logger.trace('Generating POT via HTTP server')
=======
def _get_pot(self, client: str, ydl: YoutubeDL, visitor_data=None, data_sync_id=None, player_url=None, **kwargs) -> str:
# Check if we have a pre-provided token
if client == 'ios' and kwargs.get('po_token'):
self._logger.info('Using provided ios PO token')
return kwargs['po_token']
self._logger.info(f'Generating POT via HTTP server for {client} client')
if ((proxy := select_proxy('https://jnn-pa.googleapis.com', self.proxies))
!= select_proxy('https://youtube.com', self.proxies)):
self._logger.warning(
'Proxies for https://youtube.com and https://jnn-pa.googleapis.com are different. '
'This is likely to cause subsequent errors.')
>>>>>>> 559b875 (feat: Add support for pre-provided ios PO tokens and client-specific validation)
try:
response = self._request_webpage(
request=Request(
f'{self._base_url}/get_pot', data=json.dumps({
'content_binding': get_webpo_content_binding(request)[0],
'proxy': request.request_proxy,
'bypass_cache': request.bypass_cache,
'source_address': request.request_source_address,
'disable_tls_verification': not request.request_verify_tls,
}).encode(), headers={'Content-Type': 'application/json'},
extensions={'timeout': self._GETPOT_TIMEOUT}, proxies={'all': None}),
note=f'Generating a {request.context.value} PO Token for '
f'{request.internal_client_name} client via bgutil HTTP server',
)
except Exception as e:
raise PoTokenProviderError(
f'Error reaching POST /get_pot (caused by {e!r})') from e
try:
response_json = json.load(response)
except Exception as e:
raise PoTokenProviderError(
f'Error parsing response JSON (caused by {e!r}). response = {response.read().decode()}') from e
if error_msg := response_json.get('error'):
raise PoTokenProviderError(error_msg)
if 'poToken' not in response_json:
raise PoTokenProviderError(
f'Server did not respond with a poToken. Received response: {json.dumps(response_json)}')
po_token = response_json['poToken']
self.logger.trace(f'Generated POT: {po_token}')
return PoTokenResponse(po_token=po_token)
@register_preference(BgUtilHTTPPTP)
def bgutil_HTTP_getpot_preference(provider, request):
return 100
__all__ = [BgUtilHTTPPTP.__name__,
bgutil_HTTP_getpot_preference.__name__]
-------------------------
Diff to getpot_bgutil_script.py
from __future__ import annotations
import contextlib
import functools
import json
import os.path
import re
import shutil
import subprocess
from yt_dlp.extractor.youtube.pot.utils import get_webpo_content_binding
from yt_dlp.utils import Popen
with contextlib.suppress(ImportError):
from yt_dlp_plugins.extractor.getpot_bgutil import BgUtilPTPBase
from yt_dlp.extractor.youtube.pot.provider import (
PoTokenProviderError,
PoTokenRequest,
PoTokenResponse,
register_preference,
register_provider,
)
@register_provider
class BgUtilScriptPTP(BgUtilPTPBase):
PROVIDER_NAME = 'bgutil:script'
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._check_script = functools.cache(self._check_script_impl)
@functools.cached_property
def _script_path(self):
script_path = self._configuration_arg(
'script_path', casesense=True, default=[None])[0]
if script_path:
return os.path.expandvars(script_path)
# check deprecated arg
deprecated_script_path = self.ie._configuration_arg(
ie_key='youtube', key='getpot_bgutil_script', default=[None])[0]
if deprecated_script_path:
self._warn_and_raise(
"'youtube:getpot_bgutil_script' extractor arg is deprecated, use 'youtubepot-bgutilscript:script_path' instead")
# default if no arg was passed
home = os.path.expanduser('~')
default_path = os.path.join(
home, 'bgutil-ytdlp-pot-provider', 'server', 'build', 'generate_once.js')
self.logger.debug(
f'No script path passed, defaulting to {default_path}')
return default_path
<<<<<<< HEAD
def is_available(self):
return self._check_script(self._script_path)
@functools.cached_property
def _node_path(self):
node_path = shutil.which('node')
if node_path is None:
self.logger.trace('node is not in PATH')
vsn = self._check_node_version(node_path)
if vsn:
self.logger.trace(f'Node version: {vsn}')
return node_path
def _check_script_impl(self, script_path):
=======
def _validate_get_pot(self, client: str, ydl: YoutubeDL, visitor_data=None, data_sync_id=None, player_url=None, **kwargs):
script_path = ydl.get_info_extractor('Youtube')._configuration_arg(
'getpot_bgutil_script', [self._default_script_path], casesense=True)[0]
# If a specific client is requested, validate it's supported
requested_client = ydl.params.get('extractor_args', {}).get('youtube', {}).get('formats')
if requested_client and client != requested_client:
raise UnsupportedRequest(f'Skipping {client} as {requested_client} was specifically requested')
if not data_sync_id and not visitor_data:
raise UnsupportedRequest(
'One of [data_sync_id, visitor_data] must be passed')
>>>>>>> 046a994 (refactor: support client-specific requests via extractor_args in POT providers)
if not os.path.isfile(script_path):
self.logger.debug(
f"Script path doesn't exist: {script_path}")
return False
if os.path.basename(script_path) != 'generate_once.js':
self.logger.warning(
'Incorrect script passed to extractor args. Path to generate_once.js required', once=True)
return False
node_path = self._node_path
if not node_path:
return False
stdout, stderr, returncode = Popen.run(
[self._node_path, script_path, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True,
timeout=self._GET_SERVER_VSN_TIMEOUT)
if returncode:
self.logger.warning(
f'Failed to check script version. '
f'Script returned {returncode} exit status. '
f'Script stdout: {stdout}; Script stderr: {stderr}',
once=True)
return False
else:
self._check_version(stdout.strip(), name='script')
return True
def _check_node_version(self, node_path):
try:
stdout, stderr, returncode = Popen.run(
[node_path, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True,
timeout=self._GET_SERVER_VSN_TIMEOUT)
stdout = stdout.strip()
mobj = re.match(r'v(\d+)\.(\d+)\.(\d+)', stdout)
if returncode or not mobj:
raise ValueError
node_vsn = tuple(map(int, mobj.groups()))
if node_vsn >= self._MIN_NODE_VSN:
return node_vsn
raise RuntimeError
except RuntimeError:
min_vsn_str = 'v' + '.'.join(str(v) for v in self._MIN_NODE_VSN)
self.logger.warning(
f'Node version too low. '
f'(got {stdout}, but at least {min_vsn_str} is required)')
except (subprocess.TimeoutExpired, ValueError):
self.logger.warning(
f'Failed to check node version. '
f'Node returned {returncode} exit status. '
f'Node stdout: {stdout}; Node stderr: {stderr}')
def _real_request_pot(
self,
request: PoTokenRequest,
) -> PoTokenResponse:
# used for CI check
self.logger.trace(
f'Generating POT via script: {self._script_path}')
command_args = [self._node_path, self._script_path]
if proxy := request.request_proxy:
command_args.extend(['-p', proxy])
command_args.extend(['-c', get_webpo_content_binding(request)[0]])
if request.bypass_cache:
command_args.append('--bypass-cache')
if request.request_source_address:
command_args.extend(
['--source-address', request.request_source_address])
if request.request_verify_tls is False:
command_args.append('--disable-tls-verification')
self.logger.info(
f'Generating a {request.context.value} PO Token for '
f'{request.internal_client_name} client via bgutil script',
)
self.logger.debug(
f'Executing command to get POT via script: {" ".join(command_args)}')
try:
stdout, stderr, returncode = Popen.run(
command_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True,
timeout=self._GETPOT_TIMEOUT)
except subprocess.TimeoutExpired as e:
raise PoTokenProviderError(
f'_get_pot_via_script failed: Timeout expired when trying to run script (caused by {e!r})')
except Exception as e:
raise PoTokenProviderError(
f'_get_pot_via_script failed: Unable to run script (caused by {e!r})') from e
msg = f'stdout:\n{stdout.strip()}'
if stderr.strip(): # Empty strings are falsy
msg += f'\nstderr:\n{stderr.strip()}'
self.logger.trace(msg)
if returncode:
raise PoTokenProviderError(
f'_get_pot_via_script failed with returncode {returncode}')
try:
# The JSON response is always the last line
script_data_resp = json.loads(stdout.splitlines()[-1])
except json.JSONDecodeError as e:
raise PoTokenProviderError(
f'Error parsing JSON response from _get_pot_via_script (caused by {e!r})') from e
if 'poToken' not in script_data_resp:
raise PoTokenProviderError(
'The script response did not contain a poToken')
return PoTokenResponse(po_token=script_data_resp['poToken'])
@register_preference(BgUtilScriptPTP)
def bgutil_script_getpot_preference(provider, request):
return 1
__all__ = [BgUtilScriptPTP.__name__,
bgutil_script_getpot_preference.__name__]
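# A minimal sketch (an assumption, not part of the provider) of the stdout contract that
# _real_request_pot relies on: generate_once.js may print arbitrary log lines, but its
# last line must be a JSON object with a "poToken" field, e.g.
#
#   example_stdout = 'some debug output\n{"poToken": "abcd1234"}'
#   json.loads(example_stdout.splitlines()[-1])  # -> {'poToken': 'abcd1234'}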

@ -0,0 +1 @@
Subproject commit c79e8dc48151c8dd7c0349b85ada2ccfcdfeb75b

View File

@ -55,7 +55,8 @@ RUN conda init bash && \
     conda create -n camo python=3.11 -y
 # Install Python dependencies in conda environment
-RUN conda run -n camo pip install --no-cache-dir "camoufox[geoip]" playwright==1.49
+COPY requirements.txt .
+RUN conda run -n camo pip install --no-cache-dir -r requirements.txt
 # Install Playwright browsers for version 1.49
 RUN conda run -n camo playwright install --with-deps

View File

@ -0,0 +1,452 @@
#!/usr/bin/env python3
import re
import argparse
import atexit
import shutil
import logging
import sys
import os
import psutil
import time
import threading
import signal
import asyncio
import websockets
from collections import deque, defaultdict
from datetime import datetime, timedelta
from camoufox.server import launch_server
# Global variables for resource tracking
active_connections = defaultdict(int) # Track connections per endpoint
max_connections = defaultdict(int)
resource_stats = {}
server_instances = {} # Track multiple server instances
shutdown_requested = False
endpoint_locks = defaultdict(threading.Lock) # Locks for each endpoint
memory_restart_threshold = 1800 # MB - warn when exceeded
restart_in_progress = False
# Enhanced monitoring metrics
connection_pool_metrics = {
'total_acquired': 0,
'total_released': 0,
'total_reused': 0,
'pool_size': 0,
'active_contexts': 0
}
def parse_proxy_url(url):
"""Parse proxy URL in format proto://user:pass@host:port"""
pattern = r'([^:]+)://(?:([^:]+):([^@]+)@)?([^:]+):(\d+)'
match = re.match(pattern, url)
if not match:
raise ValueError('Invalid proxy URL format. Expected proto://[user:pass@]host:port')
proto, username, password, host, port = match.groups()
# Ensure username and password are strings, not None
proxy_config = {
'server': f'{proto}://{host}:{port}',
'username': username or '',
'password': password or ''
}
# Remove empty credentials
if not proxy_config['username']:
del proxy_config['username']
if not proxy_config['password']:
del proxy_config['password']
return proxy_config
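# Illustrative calls (addresses and credentials are made up):
#   parse_proxy_url("socks5://alice:secret@203.0.113.7:1080")
#   -> {'server': 'socks5://203.0.113.7:1080', 'username': 'alice', 'password': 'secret'}
#   parse_proxy_url("http://203.0.113.7:3128")
#   -> {'server': 'http://203.0.113.7:3128'}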
def monitor_resources(server_ports, proxy_url):
"""Monitor system resources and log warnings when thresholds are exceeded"""
global active_connections, max_connections, resource_stats, shutdown_requested, restart_in_progress
global connection_pool_metrics
logging.info(f"Resource monitor started for proxy '{proxy_url}' on ports {server_ports}")
log_counter = 0
while not shutdown_requested:
log_counter += 1
try:
# Get system resource usage
cpu_percent = psutil.cpu_percent(interval=1)
memory = psutil.virtual_memory()
memory_percent = memory.percent
# Get current process info
current_process = psutil.Process()
process_memory = current_process.memory_info()
process_cpu = current_process.cpu_percent()
# Update active connections using psutil
all_connections = current_process.net_connections(kind='inet')
new_active_connections = defaultdict(int)
for conn in all_connections:
if conn.status == psutil.CONN_ESTABLISHED and conn.laddr.port in server_ports:
new_active_connections[conn.laddr.port] += 1
active_connections.clear()
active_connections.update(new_active_connections)
for port, count in active_connections.items():
max_connections[port] = max(max_connections.get(port, 0), count)
connection_pool_metrics['active_contexts'] = sum(active_connections.values())
# Update resource stats
resource_stats = {
'cpu_percent': cpu_percent,
'memory_percent': memory_percent,
'process_memory_mb': process_memory.rss / 1024 / 1024,
'process_cpu_percent': process_cpu,
'total_active_connections': sum(active_connections.values()),
'active_connections_per_endpoint': dict(active_connections),
'max_connections': dict(max_connections),
'connection_pool_metrics': dict(connection_pool_metrics)
}
# Log resource usage when utilization is high
if cpu_percent > 80 or memory_percent > 80:
logging.info(f"RESOURCE STATS - CPU: {cpu_percent}%, Memory: {memory_percent}%, "
f"Process Memory: {resource_stats['process_memory_mb']:.1f}MB, "
f"Total Active Connections: {resource_stats['total_active_connections']}")
# Log connection pool metrics
pool_metrics = resource_stats['connection_pool_metrics']
logging.info(f"POOL METRICS - Acquired: {pool_metrics['total_acquired']}, "
f"Released: {pool_metrics['total_released']}, "
f"Reused: {pool_metrics['total_reused']}, "
f"Pool Size: {pool_metrics['pool_size']}, "
f"Active Contexts: {pool_metrics['active_contexts']}")
# Warning thresholds
if cpu_percent > 85:
logging.warning(f"HIGH CPU USAGE: {cpu_percent}%")
if memory_percent > 85:
logging.warning(f"HIGH MEMORY USAGE: {memory_percent}%")
if resource_stats['total_active_connections'] > 100:
logging.warning(f"HIGH TOTAL CONNECTION COUNT: {resource_stats['total_active_connections']} active connections")
if process_memory.rss > 2 * 1024 * 1024 * 1024: # 2GB
logging.warning(f"HIGH PROCESS MEMORY: {process_memory.rss / 1024 / 1024:.1f}MB")
# Safety net: Warn instead of restart if memory exceeds threshold
if resource_stats['process_memory_mb'] > memory_restart_threshold:
logging.warning(f"MEMORY THRESHOLD EXCEEDED: {resource_stats['process_memory_mb']}MB > {memory_restart_threshold}MB")
logging.warning("Manual intervention required - memory usage critical but restart disabled")
logging.warning("Consider adding new camoufox instances or reducing concurrent workers")
# Add metric for monitoring instead of restart
logging.info(f"MEMORY_ALERT: {resource_stats['process_memory_mb']}MB used on {sum(active_connections.values())} active connections")
# Add a heartbeat log every minute (30s * 2)
if log_counter % 2 == 0:
logging.info(
f"HEARTBEAT - Proxy: {proxy_url} | Ports: {server_ports} | "
f"Memory: {resource_stats.get('process_memory_mb', 0):.1f}MB | "
f"CPU: {resource_stats.get('cpu_percent', 0)}% | "
f"Active Connections: {resource_stats.get('total_active_connections', 0)}"
)
except Exception as e:
logging.error(f"Error in resource monitoring: {e}")
time.sleep(30) # Check every 30 seconds
def graceful_shutdown(signum, frame):
"""Handle graceful shutdown"""
global shutdown_requested, server_instances, restart_in_progress
logging.info("Graceful shutdown requested")
shutdown_requested = True
# Log final resource stats
if resource_stats:
logging.info(f"Final resource stats: {resource_stats}")
# Log final connection pool metrics
logging.info(f"Final connection pool metrics: {connection_pool_metrics}")
# The server instances are running in daemon threads and will be terminated
# when the main process exits. No explicit shutdown call is needed.
logging.info("Shutting down all Camoufox server instances...")
# If a restart was requested, re-exec the process in place of a normal exit
if restart_in_progress:
logging.info("Restarting Camoufox server...")
os.execv(sys.executable, [sys.executable] + sys.argv)
sys.exit(0)
def create_server_instance(port, base_config):
"""
Creates and runs a new Camoufox server instance on the specified port.
NOTE: The `launch_server` function is a blocking call that runs an event loop
and does not return. Therefore, any code after it in this function is unreachable.
"""
config = base_config.copy()
config['port'] = port
try:
# This function blocks and runs the server indefinitely.
launch_server(**config)
except Exception as e:
# If an error occurs, log it. The daemon thread will then terminate.
logging.error(f'Error launching server on port {port}: {str(e)}', exc_info=True)
def check_listening_ports(expected_ports, log_results=True):
"""Checks which of the expected ports are actively listening."""
successful_ports = []
failed_ports = []
try:
# Check all system-wide connections, not just for the current process,
# as the server may run in a child process.
listening_ports = {
conn.laddr.port for conn in psutil.net_connections(kind='inet')
if conn.status == psutil.CONN_LISTEN
}
for port in expected_ports:
if port in listening_ports:
successful_ports.append(port)
else:
failed_ports.append(port)
if log_results:
logging.info("--- Verifying Listening Ports ---")
if successful_ports:
logging.info(f"Successfully listening on ports: {sorted(successful_ports)}")
if failed_ports:
logging.error(f"FAILED to listen on ports: {sorted(failed_ports)}")
logging.info("---------------------------------")
except Exception as e:
if log_results:
logging.error(f"Could not verify listening ports: {e}")
return successful_ports, failed_ports
def main():
parser = argparse.ArgumentParser(description='Launch Camoufox server with optional proxy support')
parser.add_argument('--proxy-url', help='Optional proxy URL in format proto://user:pass@host:port (supports http, https, socks5)')
parser.add_argument('--ws-host', default='0.0.0.0', help='WebSocket server host address (e.g., localhost, 0.0.0.0)')
parser.add_argument('--port', type=int, default=12345, help='Base WebSocket server port')
parser.add_argument('--num-instances', type=int, default=4, help='Number of server instances to create')
parser.add_argument('--port-range', type=str, help='Port range in format start-end (e.g., 12345-12349)')
parser.add_argument('--base-proxy-port', type=int, default=1080, help='Base proxy port for mapping to camoufox instances')
parser.add_argument('--ws-path', default='camoufox', help='Base WebSocket server path')
parser.add_argument('--headless', action='store_true', help='Run browser in headless mode')
parser.add_argument('--geoip', nargs='?', const=True, default=False,
help='Enable geo IP protection. Can specify IP address or use True for automatic detection')
parser.add_argument('--locale', help='Locale(s) to use (e.g. "en-US" or "en-US,fr-FR")')
parser.add_argument('--block-images', action='store_true', help='Block image requests to save bandwidth')
parser.add_argument('--block-webrtc', action='store_true', help='Block WebRTC entirely')
parser.add_argument('--humanize', nargs='?', const=True, type=float,
help='Humanize cursor movements. Can specify max duration in seconds')
parser.add_argument('--extensions', type=str,
help='Comma-separated list of extension paths to enable (XPI files or extracted directories). Use quotes if paths contain spaces.')
parser.add_argument('--persistent-context', action='store_true', help='Enable persistent browser context.')
parser.add_argument('--user-data-dir', type=str, help='Directory to store persistent browser data.')
parser.add_argument('--preferences', type=str, help='Comma-separated list of Firefox preferences (e.g. "key1=value1,key2=value2")')
# Add resource monitoring arguments
parser.add_argument('--monitor-resources', action='store_true', help='Enable resource monitoring')
parser.add_argument('--max-connections-per-instance', type=int, default=50, help='Maximum concurrent connections per instance')
parser.add_argument('--connection-timeout', type=int, default=300, help='Connection timeout in seconds')
parser.add_argument('--memory-restart-threshold', type=int, default=1800, help='Memory threshold (MB) to trigger warning')
args = parser.parse_args()
# Set memory restart threshold
global memory_restart_threshold
memory_restart_threshold = args.memory_restart_threshold
# Set up signal handlers for graceful shutdown
signal.signal(signal.SIGTERM, graceful_shutdown)
signal.signal(signal.SIGINT, graceful_shutdown)
proxy_config = None
if args.proxy_url:
try:
proxy_config = parse_proxy_url(args.proxy_url)
print(f"Using proxy configuration: {args.proxy_url}")
except ValueError as e:
print(f'Error parsing proxy URL: {e}')
return
else:
print("No proxy URL provided. Running without proxy.")
# --- Basic Logging Configuration ---
log_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
log_handler = logging.StreamHandler(sys.stdout)
log_handler.setFormatter(log_formatter)
root_logger = logging.getLogger()
for handler in root_logger.handlers[:]:
root_logger.removeHandler(handler)
root_logger.addHandler(log_handler)
root_logger.setLevel(logging.DEBUG)
logging.debug("DEBUG logging enabled. Starting Camoufox server setup...")
# --- End Logging Configuration ---
try:
# --- Check DISPLAY environment variable ---
display_var = os.environ.get('DISPLAY')
logging.info(f"Value of DISPLAY environment variable: {display_var}")
# --- End Check ---
# Build base config dictionary
base_config = {
'headless': False, # Force non-headless mode for VNC
'geoip': True, # Always enable GeoIP (set unconditionally; overrides --geoip)
'host': args.ws_host,
'ws_path': args.ws_path,
'env': {'DISPLAY': os.environ.get('DISPLAY')}
}
# Add proxy to config only if it was successfully parsed
if proxy_config:
base_config['proxy'] = proxy_config
# Add optional parameters
if args.locale:
base_config['locale'] = args.locale
if args.block_images:
base_config['block_images'] = True
if args.block_webrtc:
base_config['block_webrtc'] = True
if args.humanize:
base_config['humanize'] = args.humanize if isinstance(args.humanize, float) else True
# Add persistent context options
if args.persistent_context:
base_config['persistent_context'] = True
if args.user_data_dir:
base_config['user_data_dir'] = args.user_data_dir
# Add Firefox preferences
if args.preferences:
base_config['preferences'] = {}
prefs_list = args.preferences.split(',')
for pref in prefs_list:
if '=' in pref:
key, value = pref.split('=', 1)
if value.lower() in ('true', 'false'):
base_config['preferences'][key.strip()] = value.lower() == 'true'
elif value.isdigit():
base_config['preferences'][key.strip()] = int(value)
else:
base_config['preferences'][key.strip()] = value.strip()
print(f"Applied Firefox preferences: {base_config['preferences']}")
# Exclude default addons including uBlock Origin
base_config['exclude_addons'] = ['ublock_origin', 'default_addons']
print('Excluded default addons including uBlock Origin')
# Add custom extensions if specified
if args.extensions:
from pathlib import Path
valid_extensions = []
extensions_list = [ext.strip() for ext in args.extensions.split(',')]
temp_dirs_to_cleanup = []
def cleanup_temp_dirs():
for temp_dir in temp_dirs_to_cleanup:
try:
shutil.rmtree(temp_dir)
print(f"Cleaned up temporary extension directory: {temp_dir}")
except Exception as e:
print(f"Warning: Failed to clean up temp dir {temp_dir}: {e}")
atexit.register(cleanup_temp_dirs)
for ext_path in extensions_list:
ext_path = Path(ext_path).absolute()
if not ext_path.exists():
print(f"Warning: Extension path does not exist: {ext_path}")
continue
if ext_path.is_file() and ext_path.suffix == '.xpi':
import tempfile
import zipfile
try:
temp_dir = tempfile.mkdtemp(prefix=f"camoufox_ext_{ext_path.stem}_")
temp_dirs_to_cleanup.append(temp_dir)
with zipfile.ZipFile(ext_path, 'r') as zip_ref:
zip_ref.extractall(temp_dir)
valid_extensions.append(temp_dir)
print(f"Successfully loaded extension: {ext_path.name} (extracted to {temp_dir})")
except Exception as e:
print(f"Error loading extension {ext_path}: {str(e)}")
if temp_dir in temp_dirs_to_cleanup:
temp_dirs_to_cleanup.remove(temp_dir)
continue
elif ext_path.is_dir():
if (ext_path / 'manifest.json').exists():
valid_extensions.append(str(ext_path))
print(f"Successfully loaded extension: {ext_path.name}")
else:
print(f"Warning: Directory is not a valid Firefox extension: {ext_path}")
else:
print(f"Warning: Invalid extension path: {ext_path}")
if valid_extensions:
base_config['addons'] = valid_extensions
print(f"Loaded {len(valid_extensions)} extensions")
else:
print("Warning: No valid extensions were loaded")
# Create multiple server instances
ports_to_create = []
if args.port_range:
start_port, end_port = map(int, args.port_range.split('-'))
ports_to_create = list(range(start_port, end_port + 1))
else:
# Create instances starting from base port
ports_to_create = [args.port + i for i in range(args.num_instances)]
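    # Example (illustrative): --port-range 12345-12348 -> [12345, 12346, 12347, 12348];
    # otherwise --port 12345 --num-instances 4 yields the same four ports.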
# Start resource monitoring thread if enabled, passing it the ports to watch.
if args.monitor_resources:
# Pass the proxy URL to the monitor for more descriptive logging
monitor_thread = threading.Thread(target=monitor_resources, args=(ports_to_create, args.proxy_url), daemon=True)
monitor_thread.start()
print(f"Attempting to launch {len(ports_to_create)} Camoufox server instances on ports: {ports_to_create}")
for port in ports_to_create:
# launch_server is blocking, so we run each instance in its own thread.
thread = threading.Thread(target=create_server_instance, args=(port, base_config), daemon=True)
thread.start()
# Add a small delay between launching instances to avoid race conditions
# in the underlying Playwright/Camoufox library.
time.sleep(1)
# The script's main purpose is now to launch the daemon threads and then wait.
# The actual readiness is determined by the start_camoufox.sh script.
print("Server threads launched. Main process will now wait for shutdown signal.")
# Log startup resource usage
process = psutil.Process()
memory_info = process.memory_info()
logging.info(f"Server started. Initial memory usage: {memory_info.rss / 1024 / 1024:.1f}MB")
# Keep the main thread alive to host the daemon threads and handle shutdown signals
try:
while not shutdown_requested:
time.sleep(1)
except KeyboardInterrupt:
logging.info("Received KeyboardInterrupt, shutting down...")
except Exception as e:
print(f'Error launching server: {str(e)}')
logging.error(f'Error launching server: {str(e)}', exc_info=True)
if 'Browser.setBrowserProxy' in str(e):
print('Note: The browser may not support SOCKS5 proxy authentication')
return
if __name__ == '__main__':
main()
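# Example invocation (illustrative values; the flags are the argparse options defined above):
#   python3 camoufox_server.py \
#       --proxy-url socks5://user:pass@203.0.113.7:1080 \
#       --port-range 12345-12348 \
#       --monitor-resources \
#       --preferences "network.http.max-connections=100"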

View File

@ -0,0 +1,4 @@
camoufox[geoip]
playwright==1.49
psutil
websockets

View File

@ -0,0 +1,102 @@
#!/bin/bash
set -e
# Global PIDs for cleanup
VNC_PID=""
FLUXBOX_PID=""
# Cleanup function to terminate background processes on script exit
cleanup() {
echo "Cleaning up background processes..."
# Kill processes in reverse order of startup. The '|| true' prevents errors if a process is already dead.
if [ -n "$FLUXBOX_PID" ]; then kill -TERM $FLUXBOX_PID 2>/dev/null || true; fi
if [ -n "$VNC_PID" ]; then kill -TERM $VNC_PID 2>/dev/null || true; fi
echo "Cleanup complete."
}
trap cleanup EXIT
# Xvfb is now started by xvfb-run in the Dockerfile ENTRYPOINT.
# The DISPLAY variable will be set automatically by xvfb-run.
# It's safer to source conda.sh directly
source /opt/conda/etc/profile.d/conda.sh
conda activate camo
# Ensure the persistent data directory exists before we try to use it for the lock file.
mkdir -p /app/persistent-data
# --- One-time Initialization ---
# On first launch, multiple instances starting at once can cause a race condition
# during the download/extraction of the Camoufox distribution. To prevent this,
# we run a single dummy instance first, wait for it to become healthy (which
# indicates setup is complete), and then kill it. A lock file ensures this
# only happens on the very first start of the container.
INIT_LOCK_FILE="/app/persistent-data/camoufox.initialized"
if [ ! -f "$INIT_LOCK_FILE" ]; then
echo "First start detected. Performing one-time Camoufox initialization..."
# Start a single dummy instance in the background, logging to a file.
# It will perform the necessary downloads and setup.
INIT_LOG="/tmp/camoufox_init.log"
rm -f "$INIT_LOG" # Ensure log file is clean before starting
python3 -u camoufox_server.py --port 9999 --num-instances 1 > "$INIT_LOG" 2>&1 &
INIT_PID=$!
# Wait for the server to log that it's started, which is a reliable signal
# that all one-time downloads and setup tasks are complete.
echo "Waiting for initialization to complete (max 120s)..."
end_time=$((SECONDS + 120))
INIT_SUCCESS=false
while [ $SECONDS -lt $end_time ]; do
# The camoufox library logs "Websocket endpoint:" when it's ready.
# This is a more reliable signal than a custom log message from our script.
if grep -q "Websocket endpoint: ws://0.0.0.0:9999" "$INIT_LOG"; then
INIT_SUCCESS=true
break
fi
# Also check if the initialization process died unexpectedly
if ! ps -p $INIT_PID > /dev/null; then
echo "Initialization process died unexpectedly."
break
fi
sleep 2
done
if [ "$INIT_SUCCESS" = true ]; then
echo "Initialization successful."
else
echo "Initialization timed out or failed. The main server might fail to start."
echo "--- Initialization Log ---"
cat "$INIT_LOG"
echo "--------------------------"
fi
# Cleanly terminate the dummy server.
echo "Shutting down initialization server..."
kill -TERM $INIT_PID
wait $INIT_PID 2>/dev/null || true # Wait for it to exit, ignore error code
# Create the lock file to prevent this from running again.
touch "$INIT_LOCK_FILE"
echo "Initialization complete. Proceeding with normal startup."
else
echo "Initialization already complete. Skipping."
fi
# --- End Initialization ---
# Start supporting services (VNC, window manager)
echo "Starting VNC server on port 5900..."
# The -noxdamage flag is added to improve compatibility with VNC clients like the one on macOS.
# The '-localhost no' part was likely a typo and has been removed as the default is to allow non-localhost connections.
x11vnc -forever -usepw -display $DISPLAY -rfbport 5900 -o /var/log/x11vnc.log -shared -noxdamage &
VNC_PID=$!
echo "Starting Fluxbox window manager..."
fluxbox > /var/log/fluxbox.log 2>&1 &
FLUXBOX_PID=$!
# Start main application
echo "Starting Camoufox server with arguments: $@"
exec python3 -u camoufox_server.py "$@"

BIN
airflow/config/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,24 @@
# Version: 2025-08-20-02
# This file contains custom hooks for the Airflow environment.
from airflow import settings
def task_instance_mutation_hook(ti):
if ti.dag_id == 'ytdlp_ops_worker_per_url':
# Safely access dag_run and conf. The ti.dag_run attribute may not be populated
# when the hook is called during TaskInstance creation.
dag_run = getattr(ti, 'dag_run', None)
conf = getattr(dag_run, 'conf', {}) if dag_run else {}
worker_queue = conf.get('worker_queue')
if worker_queue:
print(f"Mutating queue for task {ti.task_id} to {worker_queue} based on dag_run.conf")
ti.queue = worker_queue
else:
print(f"No worker_queue in conf for {ti.dag_id}. Falling back to 'queue-dl'")
ti.queue = 'queue-dl'
# Register the hook only in appropriate contexts
# This hook can cause issues with the Triggerer, which does not have a `dag_run` context
# when it runs its own maintenance tasks.
if not settings.conf.get('core', 'executor').lower().startswith('debug'):
settings.task_instance_mutation_hook = task_instance_mutation_hook
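# Illustrative trigger (example values, not part of this file): a dag_run conf carrying
# "worker_queue" makes the hook above route that run's tasks to the named Celery queue:
#   airflow dags trigger ytdlp_ops_worker_per_url --conf '{"worker_queue": "queue-dl"}'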

View File

View File

@ -1,17 +1,17 @@
 {
     "minio_default":
     {
-        "conn_type": "s3",
-        "host": "89.253.221.173",
+        "conn_type": "aws",
+        "host": "{% raw %}{{ hostvars[groups['airflow_master'][0]].ansible_host }}{% endraw %}",
         "login": "admin",
         "password": "0153093693-0009",
         "port": 9000,
         "extra":
         {
-            "endpoint_url": "http://89.253.221.173:9000",
+            "endpoint_url": "http://{% raw %}{{ hostvars[groups['airflow_master'][0]].ansible_host }}{% endraw %}:9000",
             "aws_access_key_id": "admin",
             "aws_secret_access_key": "0153093693-0009",
             "region_name": "us-east-1"
         }
     }
 }

View File

@ -0,0 +1,10 @@
{
"redis_default":
{
"conn_type": "redis",
"host": "redis",
"port": 6379,
"password": "{{ vault_redis_password }}",
"extra": "{\"db\": 0}"
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,736 @@
import sys
import os
import time
import csv
import json
import logging
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Optional, Dict, Callable, Union
from threading import Event
from PyQt6.QtCore import Qt, QThread, pyqtSignal, QObject, QTimer
from PyQt6.QtWidgets import (
QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
QLabel, QLineEdit, QPushButton, QTextEdit, QSpinBox, QDoubleSpinBox,
QCheckBox, QGroupBox, QGridLayout, QMessageBox, QProgressBar, QDialog,
QComboBox, QFileDialog
)
# Define the current version of this tool.
CURRENT_VERSION = "1.3.0"
class ProxyChecker:
"""
Fetches proxy lists from given URLs and checks if they work.
Supports cancellation, pause/resume, progress reporting, and collects optional detailed
response times, anonymity classification, and geo-location details for working proxies.
"""
def __init__(self,
proxy_urls: Dict[str, str],
timeout: int = 1,
max_retries: int = 3,
retry_delay: float = 1.0,
max_workers: int = 20,
check_url: str = "http://www.google.com",
detailed_results: bool = False,
export_format: str = "txt", # or "csv" or "json"
user_agent: Optional[str] = None,
log_callback: Optional[Callable[[str], None]] = None,
progress_callback: Optional[Callable[[int], None]] = None):
self.proxy_urls = proxy_urls
self.timeout = timeout
self.max_retries = max_retries
self.retry_delay = retry_delay
self.max_workers = max_workers
self.check_url = check_url
self.detailed_results = detailed_results
self.export_format = export_format.lower()
self.user_agent = user_agent
self.log_callback = log_callback
self.progress_callback = progress_callback
self.cancel_event = Event()
self.pause_event = Event() # When set, processing is paused
# Statistics counters
self.total_proxies_checked = 0
self.working_proxies_found = 0
self.overall_total_count = 0
self.overall_processed_count = 0
# Store detailed working results by type.
self.working_results: Dict[str, List[Union[str, Dict[str, Union[str, float, dict]]]]] = {}
self.session = requests.Session()
if self.user_agent:
self.session.headers["User-Agent"] = self.user_agent
# Determine the client IP to help with anonymity detection.
try:
r = requests.get("https://api.ipify.org?format=json", timeout=3)
r.raise_for_status()
self.client_ip = r.json().get("ip")
self.log("info", f"Client IP determined as {self.client_ip}")
except requests.RequestException:
self.client_ip = "unknown"
self.log("warning", "Could not determine client IP for anonymity detection.")
def log(self, level: str, message: str) -> None:
full_message = f"{level.upper()}: {message}"
if self.log_callback:
self.log_callback(full_message)
else:
print(full_message)
def cancel(self) -> None:
self.cancel_event.set()
self.log("info", "Cancellation requested.")
def pause(self) -> None:
self.pause_event.set()
self.log("info", "Proxy checking paused.")
def resume(self) -> None:
self.pause_event.clear()
self.log("info", "Proxy checking resumed.")
def determine_anonymity(self, proxy: str) -> str:
try:
session = requests.Session()
session.proxies = {'http': proxy, 'https': proxy}
r = session.get("https://api.ipify.org?format=json", timeout=self.timeout)
r.raise_for_status()
proxy_ip = r.json().get("ip")
return "transparent" if proxy_ip == self.client_ip else "anonymous"
except requests.RequestException:
return "unknown"
def get_geo_info(self, ip: str) -> dict:
try:
r = requests.get(f"http://ip-api.com/json/{ip}", timeout=3)
r.raise_for_status()
return r.json()
except requests.RequestException:
return {}
def check_proxy(self, proxy: str) -> Optional[Union[str, dict]]:
if self.cancel_event.is_set():
return None
# If paused, wait until resumed.
while self.pause_event.is_set():
time.sleep(0.1)
try:
start = time.time()
session = requests.Session()
session.proxies = {'http': proxy, 'https': proxy}
if self.user_agent:
session.headers["User-Agent"] = self.user_agent
response = session.get(self.check_url, timeout=self.timeout)
elapsed = time.time() - start
if response.status_code == 200:
if self.detailed_results:
anonymity = self.determine_anonymity(proxy)
ip_only = proxy.split(':')[0]
geo = self.get_geo_info(ip_only)
return {
"proxy": proxy,
"response_time": elapsed,
"anonymity": anonymity,
"geo": geo
}
else:
return proxy
except requests.RequestException:
return None
def get_proxies(self, url: str) -> List[str]:
for attempt in range(self.max_retries):
if self.cancel_event.is_set():
self.log("info", "Cancellation detected while fetching proxies.")
return []
try:
response = self.session.get(url, timeout=self.timeout)
response.raise_for_status()
self.log("info", f"Successfully fetched proxies from {url}")
return response.text.strip().splitlines()
except requests.RequestException as e:
self.log("warning", f"Attempt {attempt + 1} failed for {url}: {e}")
time.sleep(self.retry_delay)
self.log("error", f"Failed to retrieve proxies from {url} after {self.max_retries} attempts.")
return []
@staticmethod
def create_proxy_dir(directory: str) -> None:
os.makedirs(directory, exist_ok=True)
def process_proxies(self,
proxy_type: str,
url: Optional[str] = None,
proxies: Optional[List[str]] = None) -> int:
if proxies is None and url is not None:
proxies = self.get_proxies(url)
if self.cancel_event.is_set():
self.log("info", "Cancellation detected before processing proxies.")
return 0
if not proxies:
self.log("warning", f"No proxies to check for {proxy_type}")
return 0
total_proxies = len(proxies)
self.log("info", f"Checking {total_proxies} {proxy_type} proxies with {self.max_workers} workers.")
working_proxy_list = []
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
futures = {executor.submit(self.check_proxy, proxy): proxy for proxy in proxies}
for future in as_completed(futures):
while self.pause_event.is_set():
time.sleep(0.1)
if self.cancel_event.is_set():
self.log("info", "Cancellation detected during proxy checking loop.")
break
result = future.result()
self.overall_processed_count += 1
if self.progress_callback and self.overall_total_count > 0:
progress_percent = int((self.overall_processed_count / self.overall_total_count) * 100)
self.progress_callback(progress_percent)
if result:
working_proxy_list.append(result)
self.working_results[proxy_type] = working_proxy_list
file_ext = ".csv" if self.export_format == "csv" else ".json" if self.export_format == "json" else ".txt"
proxy_file = f'proxies/{proxy_type}{file_ext}'
self.create_proxy_dir(os.path.dirname(proxy_file))
try:
if self.export_format == "csv":
with open(proxy_file, 'w', newline='') as f:
if self.detailed_results:
writer = csv.writer(f)
writer.writerow(["Proxy", "Response Time (s)", "Anonymity", "Country", "Region", "City"])
for item in working_proxy_list:
geo = item.get("geo", {})
writer.writerow([
item.get("proxy"),
f"{item.get('response_time', 0):.2f}",
item.get("anonymity"),
geo.get("country", ""),
geo.get("regionName", ""),
geo.get("city", "")
])
else:
writer = csv.writer(f)
writer.writerow(["Proxy"])
for item in working_proxy_list:
writer.writerow([item])
elif self.export_format == "json":
with open(proxy_file, 'w') as f:
json.dump(working_proxy_list, f, indent=4)
else:
with open(proxy_file, 'w') as f:
if self.detailed_results:
lines = [
f"{item.get('proxy')} - {item.get('response_time'):.2f} s - {item.get('anonymity')} - {item.get('geo', {}).get('country', '')}"
for item in working_proxy_list
]
else:
lines = working_proxy_list
f.write('\n'.join(lines) + '\n')
except OSError as e:
self.log("error", f"Failed to write working proxies to {proxy_file}: {e}")
self.log("info", f"Checked {total_proxies} {proxy_type} proxies. Working: {len(working_proxy_list)}.")
self.total_proxies_checked += total_proxies
self.working_proxies_found += len(working_proxy_list)
return len(working_proxy_list)
def get_statistics(self) -> str:
stats = f"Total proxies checked: {self.total_proxies_checked}\n"
stats += f"Working proxies found: {self.working_proxies_found}\n"
if self.detailed_results:
all_times = []
for lst in self.working_results.values():
all_times.extend([item.get("response_time") for item in lst if isinstance(item, dict)])
if all_times:
avg_time = sum(all_times) / len(all_times)
stats += f"Average response time: {avg_time:.2f} seconds\n"
return stats
def run(self) -> None:
start_time = time.time()
self.overall_total_count = 0
self.overall_processed_count = 0
proxies_by_type: Dict[str, List[str]] = {}
for proxy_type, url in self.proxy_urls.items():
if self.cancel_event.is_set():
self.log("info", "Cancellation detected. Aborting processing.")
return
proxies = self.get_proxies(url)
proxies_by_type[proxy_type] = proxies
self.overall_total_count += len(proxies)
if self.overall_total_count == 0:
self.log("warning", "No proxies fetched from any source.")
for proxy_type, proxies in proxies_by_type.items():
if self.cancel_event.is_set():
self.log("info", "Cancellation detected. Aborting further processing.")
break
self.process_proxies(proxy_type, proxies=proxies)
self.session.close()
end_time = time.time()
minutes, seconds = divmod(end_time - start_time, 60)
self.log("info", f"Total proxies checked: {self.total_proxies_checked}. Working proxies: {self.working_proxies_found}.")
self.log("info", f"Execution time: {int(minutes)} minutes {int(seconds)} seconds.")
self.log("info", "Statistics:\n" + self.get_statistics())
# Append history log
try:
with open("history.log", "a") as hist_file:
hist_file.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {self.get_statistics()}\n")
except OSError as e:
self.log("error", f"Failed to write history log: {e}")
class ProxyCheckerWorker(QObject):
"""
Worker class to run the proxy checking process in a separate thread.
Emits log messages, progress updates, and a finished signal.
"""
log_signal = pyqtSignal(str)
progress_update = pyqtSignal(int)
finished = pyqtSignal()
def __init__(self,
proxy_urls: Dict[str, str],
timeout: int,
max_retries: int,
retry_delay: float,
max_workers: int,
check_url: str,
detailed_results: bool,
export_format: str,
user_agent: Optional[str] = None):
super().__init__()
self.proxy_urls = proxy_urls
self.timeout = timeout
self.max_retries = max_retries
self.retry_delay = retry_delay
self.max_workers = max_workers
self.check_url = check_url
self.detailed_results = detailed_results
self.export_format = export_format
self.user_agent = user_agent
self.checker: Optional[ProxyChecker] = None
def log_callback(self, message: str) -> None:
self.log_signal.emit(message)
def progress_callback(self, progress: int) -> None:
self.progress_update.emit(progress)
def cancel(self) -> None:
if self.checker is not None:
self.checker.cancel()
def run(self) -> None:
self.checker = ProxyChecker(
proxy_urls=self.proxy_urls,
timeout=self.timeout,
max_retries=self.max_retries,
retry_delay=self.retry_delay,
max_workers=self.max_workers,
check_url=self.check_url,
detailed_results=self.detailed_results,
export_format=self.export_format,
user_agent=self.user_agent,
log_callback=self.log_callback,
progress_callback=self.progress_callback
)
self.log_callback("Starting proxy checking...")
self.checker.run()
self.log_callback("Proxy checking finished.")
self.finished.emit()
class UpdateChecker(QObject):
"""
Worker class to check for software updates.
"""
update_checked = pyqtSignal(str)
def run(self) -> None:
try:
response = requests.get("https://api.github.com/repos/Jesewe/proxy-checker/releases/latest", timeout=5)
response.raise_for_status()
data = response.json()
latest_version = data["tag_name"].lstrip("v")
if latest_version != CURRENT_VERSION:
msg = (f"New version available: {latest_version}.\n"
f"You are using version {CURRENT_VERSION}.\n"
f"Visit {data['html_url']} to download the update.")
else:
msg = f"You are up-to-date with version {CURRENT_VERSION}."
except Exception as e:
msg = f"Failed to check for updates: {e}"
self.update_checked.emit(msg)
class MainWindow(QMainWindow):
def __init__(self):
super().__init__()
self.setWindowTitle("Proxy Checker")
self.setGeometry(100, 100, 850, 750)
self.init_ui()
self.thread: Optional[QThread] = None
self.worker: Optional[ProxyCheckerWorker] = None
self.update_thread: Optional[QThread] = None
self.last_checker: Optional[ProxyChecker] = None
self.is_paused = False
def init_ui(self):
main_widget = QWidget()
main_layout = QVBoxLayout()
# Configuration group
config_group = QGroupBox("Settings")
config_layout = QGridLayout()
# Timeout
config_layout.addWidget(QLabel("Timeout (s):"), 0, 0)
self.timeout_spin = QSpinBox()
self.timeout_spin.setRange(1, 60)
self.timeout_spin.setValue(3)
config_layout.addWidget(self.timeout_spin, 0, 1)
# Max Retries
config_layout.addWidget(QLabel("Max Retries:"), 0, 2)
self.retries_spin = QSpinBox()
self.retries_spin.setRange(1, 10)
self.retries_spin.setValue(3)
config_layout.addWidget(self.retries_spin, 0, 3)
# Retry Delay
config_layout.addWidget(QLabel("Retry Delay (s):"), 1, 0)
self.retry_delay_spin = QDoubleSpinBox()
self.retry_delay_spin.setRange(0.1, 10.0)
self.retry_delay_spin.setSingleStep(0.1)
self.retry_delay_spin.setValue(1.0)
config_layout.addWidget(self.retry_delay_spin, 1, 1)
# Max Workers
config_layout.addWidget(QLabel("Max Workers:"), 1, 2)
self.workers_spin = QSpinBox()
self.workers_spin.setRange(1, 200)
self.workers_spin.setValue(50)
config_layout.addWidget(self.workers_spin, 1, 3)
# Test URL
config_layout.addWidget(QLabel("Test URL:"), 2, 0)
self.test_url_edit = QLineEdit("http://www.google.com")
config_layout.addWidget(self.test_url_edit, 2, 1, 1, 3)
# Custom User-Agent
config_layout.addWidget(QLabel("Custom User-Agent:"), 3, 0)
self.user_agent_edit = QLineEdit("")
self.user_agent_edit.setPlaceholderText("Leave blank for default")
config_layout.addWidget(self.user_agent_edit, 3, 1, 1, 3)
# Detailed Results Option
self.detailed_checkbox = QCheckBox("Detailed Results (Include Response Time, Anonymity & Geo)")
config_layout.addWidget(self.detailed_checkbox, 4, 0, 1, 2)
# Export Format Option
config_layout.addWidget(QLabel("Export Format:"), 4, 2)
self.export_format_combo = QComboBox()
self.export_format_combo.addItems(["txt", "csv", "json"])
config_layout.addWidget(self.export_format_combo, 4, 3)
config_group.setLayout(config_layout)
main_layout.addWidget(config_group)
# Proxy Sources Group
proxy_group = QGroupBox("Proxy Sources")
proxy_layout = QGridLayout()
self.proxy_urls = {
"http": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt",
"socks4": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt",
"socks5": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt"
}
self.proxy_type_checkboxes = {}
self.proxy_url_edits = {}
row = 0
for proxy_type, url in self.proxy_urls.items():
checkbox = QCheckBox(proxy_type)
checkbox.setChecked(True)
self.proxy_type_checkboxes[proxy_type] = checkbox
proxy_layout.addWidget(checkbox, row, 0)
url_edit = QLineEdit(url)
self.proxy_url_edits[proxy_type] = url_edit
proxy_layout.addWidget(url_edit, row, 1)
row += 1
proxy_group.setLayout(proxy_layout)
main_layout.addWidget(proxy_group)
# Progress Bar
self.progress_bar = QProgressBar()
self.progress_bar.setRange(0, 100)
self.progress_bar.setValue(0)
main_layout.addWidget(self.progress_bar)
# Main Buttons
btn_layout = QHBoxLayout()
self.start_btn = QPushButton("Start Checking")
self.start_btn.clicked.connect(self.start_checking)
btn_layout.addWidget(self.start_btn)
self.pause_btn = QPushButton("Pause")
self.pause_btn.setEnabled(False)
self.pause_btn.clicked.connect(self.toggle_pause)
btn_layout.addWidget(self.pause_btn)
self.cancel_btn = QPushButton("Cancel")
self.cancel_btn.setEnabled(False)
self.cancel_btn.clicked.connect(self.cancel_checking)
btn_layout.addWidget(self.cancel_btn)
self.show_results_btn = QPushButton("Show Results")
self.show_results_btn.setEnabled(False)
self.show_results_btn.clicked.connect(self.show_results)
btn_layout.addWidget(self.show_results_btn)
main_layout.addLayout(btn_layout)
# Extra Buttons: Show Statistics, Save Log
extra_btn_layout = QHBoxLayout()
self.show_stats_btn = QPushButton("Show Statistics")
self.show_stats_btn.setEnabled(False)
self.show_stats_btn.clicked.connect(self.show_statistics)
extra_btn_layout.addWidget(self.show_stats_btn)
self.save_log_btn = QPushButton("Save Log")
self.save_log_btn.clicked.connect(self.save_log)
extra_btn_layout.addWidget(self.save_log_btn)
main_layout.addLayout(extra_btn_layout)
# Log Text Area
self.log_text = QTextEdit()
self.log_text.setReadOnly(True)
self.log_text.setStyleSheet("background-color: #1e1e1e; color: #d4d4d4; font-family: Consolas; font-size: 12pt;")
main_layout.addWidget(self.log_text)
main_widget.setLayout(main_layout)
self.setCentralWidget(main_widget)
def start_checking(self):
self.start_btn.setEnabled(False)
self.cancel_btn.setEnabled(True)
self.pause_btn.setEnabled(True)
self.show_results_btn.setEnabled(False)
self.show_stats_btn.setEnabled(False)
self.progress_bar.setValue(0)
self.log_text.clear()
# Build proxy_urls from selected checkboxes.
selected_proxy_urls = {}
for proxy_type, checkbox in self.proxy_type_checkboxes.items():
if checkbox.isChecked():
url = self.proxy_url_edits[proxy_type].text().strip()
if url:
selected_proxy_urls[proxy_type] = url
if not selected_proxy_urls:
QMessageBox.warning(self, "No Proxies Selected", "Please select at least one proxy type to check.")
self.start_btn.setEnabled(True)
self.cancel_btn.setEnabled(False)
self.pause_btn.setEnabled(False)
return
# Get settings from UI.
timeout = self.timeout_spin.value()
max_retries = self.retries_spin.value()
retry_delay = self.retry_delay_spin.value()
max_workers = self.workers_spin.value()
check_url = self.test_url_edit.text().strip()
detailed_results = self.detailed_checkbox.isChecked()
export_format = self.export_format_combo.currentText().strip()
user_agent = self.user_agent_edit.text().strip() or None
self.thread = QThread()
self.worker = ProxyCheckerWorker(
proxy_urls=selected_proxy_urls,
timeout=timeout,
max_retries=max_retries,
retry_delay=retry_delay,
max_workers=max_workers,
check_url=check_url,
detailed_results=detailed_results,
export_format=export_format,
user_agent=user_agent
)
self.worker.moveToThread(self.thread)
self.worker.log_signal.connect(self.append_log)
self.worker.progress_update.connect(self.progress_bar.setValue)
self.worker.finished.connect(self.on_finished)
self.thread.started.connect(self.worker.run)
self.thread.finished.connect(self.thread.deleteLater)
self.thread.start()
def toggle_pause(self):
if self.worker and self.worker.checker:
if not self.is_paused:
self.worker.checker.pause()
self.is_paused = True
self.pause_btn.setText("Resume")
self.append_log("Paused proxy checking.")
else:
self.worker.checker.resume()
self.is_paused = False
self.pause_btn.setText("Pause")
self.append_log("Resumed proxy checking.")
def cancel_checking(self):
if self.worker is not None:
self.append_log("Cancel requested by user...")
self.worker.cancel()
self.cancel_btn.setEnabled(False)
def append_log(self, message: str):
timestamp = time.strftime("%H:%M:%S")
self.log_text.append(f"[{timestamp}] {message}")
def on_finished(self):
self.append_log("All tasks completed.")
self.start_btn.setEnabled(True)
self.cancel_btn.setEnabled(False)
self.pause_btn.setEnabled(False)
self.show_results_btn.setEnabled(True)
self.show_stats_btn.setEnabled(True)
if self.thread is not None:
self.thread.quit()
self.thread.wait()
# Save a reference to the last checker for filtering results.
if self.worker:
self.last_checker = self.worker.checker
def show_results(self):
# If detailed results are enabled, allow filtering by response time.
if self.last_checker and self.last_checker.detailed_results:
dialog = QDialog(self)
dialog.setWindowTitle("Filtered Working Proxies")
dialog.resize(600, 500)
layout = QVBoxLayout()
filter_layout = QHBoxLayout()
filter_layout.addWidget(QLabel("Max Response Time (s):"))
filter_spin = QDoubleSpinBox()
filter_spin.setRange(0.1, 10.0)
filter_spin.setSingleStep(0.1)
filter_spin.setValue(1.0)
filter_layout.addWidget(filter_spin)
apply_btn = QPushButton("Apply Filter")
filter_layout.addWidget(apply_btn)
layout.addLayout(filter_layout)
result_area = QTextEdit()
result_area.setReadOnly(True)
layout.addWidget(result_area)
def apply_filter():
threshold = filter_spin.value()
text = ""
for ptype, results in self.last_checker.working_results.items():
filtered = []
for item in results:
if isinstance(item, dict) and item.get("response_time") <= threshold:
geo = item.get("geo", {})
filtered.append(f"{item.get('proxy')} - {item.get('response_time'):.2f} s - {item.get('anonymity')} - {geo.get('country', '')}")
if filtered:
text += f"--- {ptype} ---\n" + "\n".join(filtered) + "\n\n"
result_area.setText(text if text else "No proxies match the filter criteria.")
apply_btn.clicked.connect(apply_filter)
# Show all results initially
apply_filter()
btn_layout = QHBoxLayout()
copy_btn = QPushButton("Copy to Clipboard")
copy_btn.clicked.connect(lambda: QApplication.clipboard().setText(result_area.toPlainText()))
btn_layout.addWidget(copy_btn)
close_btn = QPushButton("Close")
close_btn.clicked.connect(dialog.close)
btn_layout.addWidget(close_btn)
layout.addLayout(btn_layout)
dialog.setLayout(layout)
dialog.exec()
else:
# Fallback: read the exported files from the proxies directory.
results_text = ""
proxy_dir = "proxies"
if os.path.isdir(proxy_dir):
for filename in os.listdir(proxy_dir):
filepath = os.path.join(proxy_dir, filename)
results_text += f"--- {filename} ---\n"
try:
with open(filepath, 'r') as f:
results_text += f.read() + "\n"
except OSError as e:
results_text += f"Error reading file: {e}\n"
else:
results_text = "No results found."
dialog = QDialog(self)
dialog.setWindowTitle("Working Proxies")
dialog.resize(600, 400)
dlg_layout = QVBoxLayout()
text_area = QTextEdit()
text_area.setReadOnly(True)
text_area.setText(results_text)
dlg_layout.addWidget(text_area)
btn_layout = QHBoxLayout()
copy_btn = QPushButton("Copy to Clipboard")
copy_btn.clicked.connect(lambda: QApplication.clipboard().setText(results_text))
btn_layout.addWidget(copy_btn)
close_btn = QPushButton("Close")
close_btn.clicked.connect(dialog.close)
btn_layout.addWidget(close_btn)
dlg_layout.addLayout(btn_layout)
dialog.setLayout(dlg_layout)
dialog.exec()
def show_statistics(self):
if self.worker and self.worker.checker:
stats = self.worker.checker.get_statistics()
else:
stats = "No statistics available."
QMessageBox.information(self, "Statistics", stats)
def save_log(self):
filename, _ = QFileDialog.getSaveFileName(self, "Save Log", "", "Text Files (*.txt);;All Files (*)")
if filename:
try:
with open(filename, 'w') as f:
f.write(self.log_text.toPlainText())
QMessageBox.information(self, "Saved", f"Log saved to {filename}")
except OSError as e:
QMessageBox.warning(self, "Error", f"Failed to save log: {e}")
def auto_check_for_update(self):
self.update_thread = QThread()
self.update_worker = UpdateChecker()
self.update_worker.moveToThread(self.update_thread)
self.update_worker.update_checked.connect(self.show_update_message)
self.update_thread.started.connect(self.update_worker.run)
self.update_thread.start()
def show_update_message(self, msg: str):
QMessageBox.information(self, "Update Check", msg)
self.update_thread.quit()
self.update_thread.wait()
def showEvent(self, event):
super().showEvent(event)
QTimer.singleShot(1000, self.auto_check_for_update)
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
app = QApplication(sys.argv)
window = MainWindow()
window.show()
sys.exit(app.exec())

View File

@ -0,0 +1,941 @@
from airflow import DAG
from airflow.models import BaseOperator, Variable
from airflow.utils.decorators import apply_defaults
from airflow.hooks.base import BaseHook
from airflow.exceptions import AirflowException
from airflow.utils.dates import days_ago
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from thrift.transport.TTransport import TTransportException
from datetime import datetime, timedelta
from pangramia.yt.exceptions.ttypes import PBServiceException
import redis
import logging
import time
import socket
import json
import os
from pangramia.yt.tokens_ops import YTTokenOpService
from pangramia.yt.common.ttypes import TokenUpdateMode
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.operators.python import PythonOperator
from airflow.models.param import Param
# Assuming ytdlp_utils exists in the same directory or PYTHONPATH
# from ytdlp_utils import get_info_json, is_valid_json, extract_video_id
# Configure logging
logger = logging.getLogger(__name__)
# Default settings (similar to ytdlp_client_dag.py)
MAX_RETRIES = 1
RETRY_DELAY = timedelta(seconds=10)
DEFAULT_TIMEOUT = 30
class YtdlpOpsOperator(BaseOperator):
"""
Custom Airflow operator to interact with YTDLP Thrift service. Handles direct connections
and Redis-based discovery, retrieves tokens, saves info.json, and manages errors.
"""
template_fields = ('url', 'service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir')
@apply_defaults
def __init__(self, url, redis_conn_id='redis_default', max_retries=3, retry_delay=10,
service_ip=None, service_port=None, redis_enabled=False, account_id=None,
save_info_json=True, info_json_dir=None, get_socks_proxy=True,
store_socks_proxy=False, timeout=DEFAULT_TIMEOUT, *args, **kwargs):
super().__init__(*args, **kwargs)
logger.info(f"Initializing YtdlpOpsOperator with parameters: url={url}, "
f"redis_conn_id={redis_conn_id}, max_retries={max_retries}, retry_delay={retry_delay}, "
f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, "
f"account_id={account_id}, save_info_json={save_info_json}, info_json_dir={info_json_dir}, "
f"get_socks_proxy={get_socks_proxy}, store_socks_proxy={store_socks_proxy}, timeout={timeout}")
# Validate required parameters
if not url:
raise ValueError("url is required")
# Validate parameters based on connection mode
if redis_enabled:
if not account_id:
raise ValueError("account_id is required when redis_enabled=True")
# Use default Redis connection if not specified
if not redis_conn_id:
redis_conn_id = 'redis_default'
logger.info(f"Using default Redis connection ID: {redis_conn_id}")
else:
if not service_ip or not service_port:
raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False")
if not account_id:
logger.warning("No account_id provided for direct connection mode. Using 'default'")
account_id = 'default' # Assign default if missing in direct mode
self.url = url
self.redis_conn_id = redis_conn_id
self.max_retries = max_retries
self.retry_delay = int(retry_delay.total_seconds() if isinstance(retry_delay, timedelta) else retry_delay)
self.service_ip = service_ip
self.service_port = service_port
self.redis_enabled = redis_enabled
self.account_id = account_id
self.save_info_json = save_info_json
self.info_json_dir = info_json_dir
self.get_socks_proxy = get_socks_proxy
self.store_socks_proxy = store_socks_proxy
self.timeout = timeout
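    # Illustrative instantiation (placeholder values, not from the original DAG):
    #   YtdlpOpsOperator(
    #       task_id='get_token',
    #       url='{{ dag_run.conf["url"] }}',
    #       redis_enabled=True,
    #       redis_conn_id='redis_default',
    #       account_id='{{ dag_run.conf.get("account_id", "default") }}',
    #       save_info_json=True,
    #       info_json_dir='/tmp/info_json',
    #   )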
def execute(self, context):
logger.info("Executing YtdlpOpsOperator")
transport = None
try:
logger.info("Getting task parameters")
params = context.get('params', {})
redis_enabled = params.get('redis_enabled', self.redis_enabled)
logger.info(f"Using redis_enabled={redis_enabled} (from {'task params' if 'redis_enabled' in params else 'operator init'})")
# Determine account_id to use (from params or operator default)
account_id = context['params'].get('account_id', self.account_id)
logger.info(f"Using account_id='{account_id}' (from {'task params' if 'account_id' in params else 'operator init'})")
if redis_enabled:
# Get Redis connection with proper authentication and error handling
redis_conn = BaseHook.get_connection(self.redis_conn_id)
redis_client = redis.Redis(
host=redis_conn.host,
port=redis_conn.port,
password=redis_conn.password,
db=0,
decode_responses=True # Important for consistent key handling
)
# Test Redis connection
try:
if not redis_client.ping():
raise redis.exceptions.ConnectionError("Redis ping failed")
logger.info(f"Successfully connected to Redis at {redis_conn.host}:{redis_conn.port}")
except redis.exceptions.AuthenticationError:
logger.error(f"Redis authentication failed for connection '{self.redis_conn_id}'. Check password.")
raise AirflowException("Redis authentication failed.")
except redis.exceptions.ConnectionError as e:
logger.error(f"Could not connect to Redis at {redis_conn.host}:{redis_conn.port}. Error: {e}")
raise AirflowException(f"Redis connection failed: {e}")
except Exception as e:
logger.error(f"Unexpected Redis error: {str(e)}")
raise AirflowException(f"Unexpected Redis error: {e}")
# Get service details from Redis with retries and proper key handling
service_key = f"ytdlp:{account_id}"
legacy_key = account_id # For backward compatibility
host = None
port = None
for attempt in range(self.max_retries):
try:
logger.info(f"Attempt {attempt + 1}/{self.max_retries}: Fetching service details from Redis for keys: '{service_key}', '{legacy_key}'")
service_details = redis_client.hgetall(service_key)
if not service_details:
logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'")
service_details = redis_client.hgetall(legacy_key)
if not service_details:
raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}")
# Find IP and port, handling potential case differences and byte/string types
ip_key = next((k for k in service_details if k.lower() == 'ip'), None)
port_key = next((k for k in service_details if k.lower() == 'port'), None)
if not ip_key: raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}")
if not port_key: raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}")
host = service_details[ip_key] # Already decoded due to decode_responses=True
port_str = service_details[port_key]
try:
port = int(port_str)
except ValueError:
raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}")
logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}")
break # Success
except Exception as e:
logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {str(e)}")
if attempt == self.max_retries - 1:
logger.error("Max retries reached for fetching Redis details.")
raise AirflowException(f"Failed to get service details from Redis after {self.max_retries} attempts: {e}")
logger.info(f"Retrying in {self.retry_delay} seconds...")
time.sleep(self.retry_delay)
else:
# Direct connection: Log parameter sources
params = context.get('params', {})
host = params.get('service_ip', self.service_ip)
host_source = 'task params' if 'service_ip' in params else 'operator init'
port_str = params.get('service_port', self.service_port)
port_source = 'task params' if 'service_port' in params else 'operator init'
url = params.get('url', self.url)
url_source = 'task params' if 'url' in params else 'operator init'
logger.info(f"Using service_ip={host} (from {host_source})")
logger.info(f"Using service_port={port_str} (from {port_source})")
logger.info(f"Using url={url} (from {url_source})")
if not host or not port_str:
raise ValueError("Direct connection requires service_ip and service_port")
try:
port = int(port_str)
except ValueError:
raise ValueError(f"Invalid service_port value: {port_str}")
logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)")
# Render and validate timeout
timeout_param = context.get('params', {}).get('timeout', self.timeout)
if isinstance(self.timeout, str) and '{{' in self.timeout:
timeout_rendered = self.render_template(self.timeout, context)
logger.info(f"Rendered timeout template: '{self.timeout}' -> '{timeout_rendered}'")
timeout_param = timeout_rendered
try:
timeout = int(timeout_param)
if timeout <= 0: raise ValueError("Timeout must be positive")
logger.info(f"Using timeout: {timeout} seconds")
except (ValueError, TypeError):
logger.warning(f"Invalid timeout value: '{timeout_param}'. Using default: {DEFAULT_TIMEOUT}")
timeout = DEFAULT_TIMEOUT
# Create Thrift connection objects
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET) # Explicitly use AF_INET (IPv4)
socket_conn.setTimeout(timeout * 1000) # Thrift timeout is in milliseconds
transport = TTransport.TFramedTransport(socket_conn)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = YTTokenOpService.Client(protocol)
logger.info(f"Attempting to connect to Thrift server at {host}:{port}...")
try:
transport.open()
logger.info("Successfully connected to Thrift server.")
# Test connection with ping
try:
client.ping()
logger.info("Server ping successful.")
except Exception as e:
logger.error(f"Server ping failed: {e}")
raise AirflowException(f"Server connection test (ping) failed: {e}")
# Get token from service with specific error handling
try:
url_param = context.get('params', {}).get('url', self.url)
logger.info(f"Requesting token for accountId='{account_id}', url='{url_param}'")
token_data = client.getOrRefreshToken(
accountId=account_id,
updateType=TokenUpdateMode.AUTO,
url=url_param
)
logger.info("Successfully retrieved token data from service.")
except PBServiceException as e:
logger.error(f"PBServiceException occurred: Code={getattr(e, 'errorCode', 'N/A')}, Message={getattr(e, 'message', 'N/A')}")
error_code = getattr(e, 'errorCode', None)
error_msg = f"YTDLP service error: {getattr(e, 'message', str(e))}"
# Handle specific known error codes
if error_code in [
"SOCKS5_CONNECTION_FAILED", "SOCKS5_TIMEOUT",
"SOCKS5_CONNECTION_REFUSED", "SOCKS5_CONNECTION_TIMEOUT",
"SOCKS5_HOST_NOT_FOUND", "SOCKS5_NETWORK_UNREACHABLE"
]:
error_msg = f"SOCKS5 proxy error ({error_code}): {e.message}. Check proxy settings."
elif error_code == "BOT_DETECTION":
error_msg = f"Bot detection triggered ({error_code}): {e.message}."
suggestions = getattr(e, 'context', {}).get('suggestions', [])
if suggestions: error_msg += "\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions)
elif error_code == "NODEJS_SCRIPT_ERROR":
error_msg = f"Node.js script error ({error_code}): {e.message}."
elif error_code == "NODEJS_TIMEOUT":
error_msg = f"Node.js timeout ({error_code}): {e.message}."
# Add more specific error handling as needed
raise AirflowException(error_msg)
except TTransportException as e:
logger.error(f"Thrift transport error during getOrRefreshToken: {e}")
raise AirflowException(f"Transport error during API call: {e}")
except Exception as e:
logger.error(f"Unexpected error during getOrRefreshToken: {e}")
raise AirflowException(f"Unexpected error during API call: {e}")
except TTransportException as e:
# Handle connection-specific transport errors
if "read 0 bytes" in str(e) or "Could not connect to" in str(e) or "Connection refused" in str(e):
logger.error(f"Connection failed to {host}:{port}. Details: {e}")
logger.error("Possible causes: Server down, firewall block, incorrect IP/port.")
raise AirflowException(f"Failed to connect to YTDLP service at {host}:{port}: {e}")
else:
logger.error(f"Thrift transport error during connection: {str(e)}")
raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}")
except Exception as e:
logger.error(f"Unexpected error during connection or ping: {str(e)}")
raise # Re-raise other unexpected errors
# Log received token data attributes for debugging
logger.debug(f"Token data received. Attributes: {dir(token_data)}")
for attr in dir(token_data):
if not attr.startswith('__') and not callable(getattr(token_data, attr)): # Log non-callable attributes
value = getattr(token_data, attr)
if attr == 'infoJson' and value:
logger.debug(f"infoJson: {value[:50]}...")
else:
logger.debug(f"{attr}: {value}")
info_json_path = None # Initialize info_json_path
save_info_json_param = context['params'].get('save_info_json', self.save_info_json)
# Render if it's a string template
if isinstance(save_info_json_param, str):
save_info_json_rendered = self.render_template(save_info_json_param, context)
# Convert common string representations to boolean
save_info_json = str(save_info_json_rendered).lower() in ['true', '1', 't', 'y', 'yes']
else:
save_info_json = bool(save_info_json_param)
# Save info.json if requested and valid
if save_info_json:  # Use the rendered/param-derived value computed above
info_json = self._get_info_json(token_data)
if info_json and self._is_valid_json(info_json):
try:
# Use internal _save_info_json method which handles rendering, dir creation, logging
info_json_path = self._save_info_json(context, info_json)
if info_json_path: # Check if saving was successful
context['task_instance'].xcom_push(key='info_json_path', value=info_json_path)
logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}")
else:
# _save_info_json should log errors, push None to indicate failure
context['task_instance'].xcom_push(key='info_json_path', value=None)
logger.warning("info.json saving failed (check logs from _save_info_json), pushing None to XCom for info_json_path.")
except Exception as e:
logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True)
context['task_instance'].xcom_push(key='info_json_path', value=None) # Push None on error
elif info_json:
logger.warning("Retrieved infoJson is not valid JSON. Skipping save.")
context['task_instance'].xcom_push(key='info_json_path', value=None)
else:
logger.info("No infoJson found in token data. Skipping save.")
context['task_instance'].xcom_push(key='info_json_path', value=None)
else:
logger.info("save_info_json is False. Skipping info.json save.")
context['task_instance'].xcom_push(key='info_json_path', value=None)
# Extract and potentially store SOCKS proxy
socks_proxy = None
if self.get_socks_proxy: # Use instance attribute
# Check for common attribute names for proxy
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
if proxy_attr:
socks_proxy = getattr(token_data, proxy_attr)
if socks_proxy: # Ensure proxy value is not empty
logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=socks_proxy)
logger.info(f"Pushed key 'socks_proxy' to XCom with value: {socks_proxy}")
else:
logger.info("SOCKS proxy extracted but not pushed to XCom (store_socks_proxy=False).")
else:
logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty. No proxy extracted.")
# Push None even if found but empty, if storing is enabled
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as extracted value was empty.")
else:
logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found in token data.")
# Push None if storing is enabled but attribute not found
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as attribute was not found.")
else:
logger.info("get_socks_proxy is False. Skipping proxy extraction.")
# Push None if storing is enabled but extraction was skipped
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as get_socks_proxy=False.")
# Get the original command from the server
ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
if not ytdlp_cmd:
logger.error("No 'ytdlpCommand' attribute found in token data.")
raise AirflowException("Required 'ytdlpCommand' not received from service.")
logger.info(f"Original command received from server: {ytdlp_cmd}")
# Log example usage command (DO NOT MODIFY the original command here)
if info_json_path:
# Use double quotes for paths/proxy in example for robustness
example_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
if socks_proxy:
example_cmd += f" --proxy \"{socks_proxy}\""
example_cmd += " --verbose --simulate" # Add useful flags for testing
logger.info(f"\n--- Example usage with saved info.json ---")
logger.info(example_cmd)
logger.info(f"(Note: The actual command with tokens/cookies is pushed to XCom as 'ytdlp_command')")
latest_json_path = os.path.join(os.path.dirname(info_json_path), 'latest.json')
logger.info(f"(You can also use 'latest.json': {latest_json_path})")
logger.info(f"-------------------------------------------\n")
else:
logger.info("\n--- Original command pushed to XCom ('ytdlp_command') ---")
if socks_proxy:
logger.info(f"Use the extracted proxy '{socks_proxy}' (pushed to XCom if store_socks_proxy=True) with the --proxy flag.")
logger.info("Add --verbose and --simulate flags for testing the command.")
logger.info(f"-------------------------------------------------------\n")
# Push the *original* command to XCom
context['task_instance'].xcom_push(key='ytdlp_command', value=ytdlp_cmd)
logger.info(f"Pushed original command to XCom key 'ytdlp_command'.")
# Note: Returning ytdlp_cmd below implicitly pushes the same value
# to XCom under the key 'return_value'. Downstream tasks should
# preferably use the explicitly pushed 'ytdlp_command' key for clarity.
return ytdlp_cmd # Return the original command
except AirflowException as e: # Catch AirflowExceptions raised explicitly in the code above
logger.error(f"Operation failed due to AirflowException: {e}")
raise # Re-raise AirflowExceptions to ensure task failure
except (TTransportException, PBServiceException) as e: # Catch specific Thrift/Service errors not already wrapped
logger.error(f"Unhandled Thrift/Service error: {e}", exc_info=True) # Add traceback for context
raise AirflowException(f"Unhandled YTDLP service error: {e}") # Wrap in AirflowException
except Exception as e: # General catch-all for truly unexpected errors
# Log with traceback for unexpected errors
logger.error(f"Caught unexpected error in YtdlpOpsOperator: {e}", exc_info=True)
# Ensure any unexpected error explicitly fails the task with AirflowException
raise AirflowException(f"Unexpected error caused task failure: {e}")
finally:
if transport and transport.isOpen(): # Check if transport exists and is open before closing
logger.info("Closing Thrift transport.")
transport.close()
# --- Helper Methods ---
def _get_info_json(self, token_data):
"""Safely extracts infoJson from token data."""
info_json = getattr(token_data, 'infoJson', None)
if info_json:
logger.debug("Extracted infoJson from token data.")
else:
logger.debug("No infoJson attribute found in token data.")
return info_json
def _is_valid_json(self, json_str):
"""Checks if a string is valid JSON."""
if not json_str or not isinstance(json_str, str):
logger.debug("Input is not a non-empty string, considered invalid JSON.")
return False
try:
json.loads(json_str)
logger.debug("JSON string validation successful.")
return True
except json.JSONDecodeError as e:
logger.warning(f"JSON validation failed: {e}")
return False
def _save_info_json(self, context, info_json):
"""Saves info_json to a file, handling directory creation and logging. Returns the path on success, None on failure."""
try:
# Get URL from params/context for video ID extraction
url_param = context.get('params', {}).get('url', self.url)
video_id = self._extract_video_id(url_param) # Use internal helper
# Render the info_json_dir template
save_dir_template = self.info_json_dir or "." # Default to current dir if template is None or empty string
save_dir = self.render_template(save_dir_template, context)
if not save_dir: # Handle case where template renders to empty string
logger.warning(f"Rendered info_json_dir template '{save_dir_template}' resulted in an empty path. Defaulting to '.'")
save_dir = "."
logger.info(f"Target directory for info.json (rendered): {save_dir}")
# Ensure directory exists
try:
os.makedirs(save_dir, exist_ok=True)
logger.info(f"Ensured directory exists: {save_dir}")
except OSError as e:
logger.error(f"Could not create directory {save_dir}: {e}. Cannot save info.json.")
return None # Indicate failure
# Construct filename (using potentially overridden account_id)
account_id_param = context.get('params', {}).get('account_id', self.account_id)
timestamp = int(time.time())
base_filename = f"info_{video_id}_{account_id_param}_{timestamp}.json" if video_id else f"info_{account_id_param}_{timestamp}.json"
info_json_path = os.path.join(save_dir, base_filename)
latest_json_path = os.path.join(save_dir, "latest.json") # Path for the latest symlink/copy
# Write to timestamped file
try:
logger.info(f"Writing info.json content (received from service) to {info_json_path}...")
with open(info_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}")
except IOError as e:
logger.error(f"Failed to write info.json to {info_json_path}: {e}")
return None # Indicate failure
# Write to latest.json (overwrite) - best effort
try:
with open(latest_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
logger.info(f"Updated latest.json file: {latest_json_path}")
except IOError as e:
# Log warning but don't fail the whole save if only latest.json fails
logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}")
return info_json_path # Return path on success (even if latest.json failed)
except Exception as e:
logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True)
return None # Indicate failure
def _extract_video_id(self, url):
"""Extracts YouTube video ID from URL (internal helper)."""
if not url or not isinstance(url, str):
logger.debug("URL is empty or not a string, cannot extract video ID.")
return None
try:
# Basic extraction logic (can be enhanced for more URL types)
video_id = None
if 'youtube.com/watch?v=' in url:
video_id = url.split('v=')[1].split('&')[0]
elif 'youtu.be/' in url:
video_id = url.split('youtu.be/')[1].split('?')[0]
# Ensure it looks like a video ID (typically 11 chars, but can vary)
if video_id and len(video_id) >= 11:
video_id = video_id[:11] # Take first 11 chars as standard ID length
logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
return video_id
else:
logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
return None
except Exception as e:
logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
return None
# =============================================================================
# Python Callables for Tasks
# =============================================================================
def display_token_info(**context):
"""Displays token info from XCom, parses info.json, and logs example commands."""
ti = context['task_instance']
logger.info("Starting display_token_info task.")
# Pull data from XCom (provide default values)
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
logger.info("\n=== Pulled Token Information from XCom ===")
logger.info(f"Info.json path: {info_json_path or 'Not found/Not saved'}")
logger.info(f"SOCKS Proxy: {socks_proxy or 'Not found/Not extracted'}")
logger.info(f"Original yt-dlp command (with tokens): {ytdlp_command or 'Not found'}")
result = {
'info_path': info_json_path,
'proxy': socks_proxy,
'ytdlp_command': ytdlp_command,
'video_info': None,
'commands': {},
'error': None
}
if info_json_path and os.path.exists(info_json_path):
logger.info(f"\n=== Processing Video Information from: {info_json_path} ===")
try:
with open(info_json_path, 'r', encoding='utf-8') as f:
info = json.load(f)
# Extract and log basic video info safely
title = info.get('title', 'Unknown Title')
uploader = info.get('uploader', 'Unknown Author')
duration = info.get('duration_string', 'Unknown Length')
upload_date_str = info.get('upload_date') # Format: YYYYMMDD
upload_date_formatted = 'Unknown Date'
if upload_date_str:
try:
# Validate format before parsing
if len(upload_date_str) == 8 and upload_date_str.isdigit():
upload_date_formatted = datetime.strptime(upload_date_str, '%Y%m%d').strftime('%Y-%m-%d')
else:
logger.warning(f"Upload date '{upload_date_str}' is not in YYYYMMDD format.")
except ValueError:
logger.warning(f"Could not parse upload_date '{upload_date_str}'")
result['video_info'] = {
'title': title,
'uploader': uploader,
'upload_date': upload_date_formatted, # Store formatted date
'duration': duration
}
logger.info(f"Title: {title}")
logger.info(f"Author: {uploader}")
logger.info(f"Date: {upload_date_formatted}")
logger.info(f"Length: {duration}")
logger.info("\n=== Example yt-dlp Commands (using saved info.json) ===")
base_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
if socks_proxy:
base_cmd += f" --proxy \"{socks_proxy}\""
# Command to list formats
format_cmd = f"{base_cmd} -F"
result['commands']['format'] = format_cmd
logger.info(f"List formats command: {format_cmd}")
# Execute and log the format listing command
logger.info("\n--- Executing Format List Command ---")
try:
# Use os.popen for simplicity, capture output
logger.info(f"Running: {format_cmd}")
format_output = os.popen(format_cmd).read()
logger.info("--- Format List Output ---")
logger.info(format_output)
logger.info("--------------------------")
except Exception as e:
logger.error(f"Error executing format command: {e}")
# Command to simulate download
simulate_cmd = f"{base_cmd} --simulate --verbose" # Add verbose for more info
result['commands']['simulate'] = simulate_cmd
logger.info(f"Simulate download command: {simulate_cmd}")
# Execute and log the simulation command
logger.info("\n--- Executing Simulation Command ---")
try:
logger.info(f"Running: {simulate_cmd}")
simulate_output = os.popen(simulate_cmd).read()
logger.info("--- Simulation Output ---")
logger.info(simulate_output)
logger.info("-------------------------")
except Exception as e:
logger.error(f"Error executing simulation command: {e}")
# Basic download command
download_cmd = base_cmd
result['commands']['download_base'] = download_cmd
logger.info(f"Base download command (add format selection, output path): {download_cmd}")
# Push generated example commands to XCom for potential downstream use
# ti.xcom_push(key='format_cmd', value=format_cmd) # Removed as requested
# ti.xcom_push(key='simulate_cmd', value=simulate_cmd) # Removed as requested
ti.xcom_push(key='download_cmd', value=download_cmd)
logger.info(f"Pushed key 'download_cmd' to XCom with value: {download_cmd}")
except json.JSONDecodeError as e:
error_msg = f"Failed to parse info.json file '{info_json_path}': {e}"
logger.error(error_msg)
result['error'] = error_msg
except FileNotFoundError:
error_msg = f"Info.json file not found at path: {info_json_path}"
logger.error(error_msg)
result['error'] = error_msg
except Exception as e:
error_msg = f"Error processing info.json file '{info_json_path}': {str(e)}"
logger.error(error_msg, exc_info=True)
result['error'] = error_msg
elif info_json_path:
error_msg = f"Info.json path provided ('{info_json_path}') but file does not exist."
logger.warning(error_msg)
result['error'] = error_msg
else:
logger.warning("No info.json path found in XCom. Cannot display video details or generate example commands.")
result['error'] = "Info.json path not available."
logger.info("Finished display_token_info task.")
# Return the collected information (useful if used as a PythonOperator return value)
return json.dumps(result) # Return as JSON string for XCom compatibility if needed
def store_token_info(**context):
"""Stores retrieved token information (command, proxy, info.json) in Redis."""
ti = context['task_instance']
# Use the redis_conn_id defined in the operator/DAG params if possible, else default
redis_conn_id = context['params'].get('redis_conn_id', 'redis_default')
redis_hook = RedisHook(redis_conn_id=redis_conn_id)
logger.info(f"Starting store_token_info task using Redis connection '{redis_conn_id}'.")
try:
# Pull necessary data from XCom and context
url = context['params'].get('url')
if not url:
# Attempt to get URL from DAG run conf as fallback
dag_run = context.get('dag_run')
url = dag_run.conf.get('url') if dag_run and getattr(dag_run, 'conf', None) else None
if not url:
raise ValueError("URL parameter is missing in context['params'] and dag_run.conf")
logger.warning("URL parameter missing in context['params'], using URL from dag_run.conf.")
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy') or '' # Default to empty string if None
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
if not ytdlp_command:
logger.warning("ytdlp_command not found in XCom. Storing empty value.")
ytdlp_command = '' # Store empty if not found
# Construct the base command using info.json
ytdlp_command_base = ''
if info_json_path and os.path.exists(info_json_path):
ytdlp_command_base = f"yt-dlp --load-info-json \"{info_json_path}\""
logger.info(f"Constructed base command: {ytdlp_command_base}")
else:
logger.warning("Cannot construct base command: info_json_path not valid.")
# Construct the command with tokens and proxy
ytdlp_command_tokens = ytdlp_command # Start with original command from server
if socks_proxy:
ytdlp_command_tokens += f" --proxy \"{socks_proxy}\""
logger.info("Appended proxy to token command.")
data_to_store = {
'url': url,
'ytdlp_command': ytdlp_command_base, # Store the base command
'proxy': socks_proxy,
'info_json_path': info_json_path or '' # Store path even if None/empty
# 'info_json' will be added below
}
# Read info.json content if path exists
info_json_content = None
if info_json_path and os.path.exists(info_json_path):
try:
with open(info_json_path, 'r', encoding='utf-8') as f:
# Read and immediately validate JSON structure before storing
info_json_content = json.load(f)
# Store the validated JSON as a string
data_to_store['info_json'] = json.dumps(info_json_content)
logger.info(f"Read and validated info.json content from: {info_json_path}")
except json.JSONDecodeError as e:
logger.error(f"Failed to parse info.json file '{info_json_path}' as JSON: {e}. Storing empty content.")
data_to_store['info_json'] = '' # Store empty string on parse error
except Exception as e:
logger.error(f"Failed to read info.json file '{info_json_path}': {e}. Storing empty content.")
data_to_store['info_json'] = '' # Store empty string on other read errors
else:
logger.warning(f"info_json_path ('{info_json_path}') not found or invalid. Storing without info_json content.")
data_to_store['info_json'] = '' # Store empty string if no path
# Determine Redis key using video ID
# Re-implement the operator's basic extraction logic here rather than instantiating
# the operator (or making _extract_video_id static) just for key generation.
video_id = None
try:
if 'youtube.com/watch?v=' in url:
video_id = url.split('v=')[1].split('&')[0][:11]
elif 'youtu.be/' in url:
video_id = url.split('youtu.be/')[1].split('?')[0][:11]
except Exception:
pass # Ignore errors in ID extraction for key generation
redis_key = f"token_info:{video_id or 'unknown'}"
logger.info(f"Determined Redis key: {redis_key}")
# Store data in Redis hash
# Log presence/absence rather than full content for potentially large fields
logger.info(f"Data to store in Redis key '{redis_key}': "
f"URL='{data_to_store['url']}', "
f"Command={'<present>' if data_to_store['ytdlp_command'] else '<empty>'}, "
f"Proxy='{data_to_store['proxy'] or '<empty>'}', "
f"Path='{data_to_store['info_json_path'] or '<empty>'}', "
f"JSON Content={'<present>' if data_to_store.get('info_json') else '<empty>'}")
with redis_hook.get_conn() as redis_client:
# Add video_id, timestamp, and the constructed ytdlp_command_tokens
data_to_store['video_id'] = video_id or 'unknown'
data_to_store['timestamp'] = int(time.time())
data_to_store['ytdlp_command_tokens'] = ytdlp_command_tokens # Store the original token command
# Log fields being stored
log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()}
logger.info(f"Storing in Redis key '{redis_key}': {log_data}")
redis_client.hset(redis_key, mapping=data_to_store)
# Set expiration (e.g., 24 hours = 86400 seconds)
redis_client.expire(redis_key, 86400)
logger.info(f"Successfully stored token info in Redis key '{redis_key}' with 24h expiration.")
# Log the final stored data again for clarity
final_log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()}
logger.info(f"--- Final Data Stored in Redis Key '{redis_key}' ---")
logger.info(final_log_data)
logger.info("----------------------------------------------------")
except Exception as e:
logger.error(f"Failed to store token info in Redis: {e}", exc_info=True)
# Re-raise as AirflowException to fail the task
raise AirflowException(f"Failed to store token info in Redis: {e}")
logger.info("Finished store_token_info task.")
# =============================================================================
# DAG Definition
# =============================================================================
# Update default_args to match ytdlp_client_dag.py structure
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False, # Match reference DAG
'email_on_retry': False, # Match reference DAG
'retries': 1, # Default task retries
'retry_delay': timedelta(minutes=5), # Standard task retry delay
'start_date': days_ago(1) # Best practice start date
}
# Update DAG definition
with DAG(
dag_id='ytdlp_client_dag_v2.1',
default_args=default_args,
schedule_interval=None, # Manually triggered DAG
catchup=False, # Don't run for past missed schedules
description='DAG for YTDLP operations using Thrift client (V2 - Refactored)', # Updated description
tags=['ytdlp', 'thrift', 'client', 'v2'], # Updated tags for better filtering
params={
# Define DAG parameters with defaults and types for UI clarity
'url': Param('https://www.youtube.com/watch?v=sOlTX9uxUtM', type=["null", "string"], description="Required: The video URL to process."), # Default URL
'redis_enabled': Param(False, type="boolean", description="Use Redis for service discovery? If False, uses service_ip/port."), # Default to direct connection
'service_ip': Param('85.192.30.55', type="string", description="Service IP if redis_enabled=False."), # Default service IP
'service_port': Param(9090, type="integer", description="Service port if redis_enabled=False."), # Default service port
'account_id': Param('account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09', type="string", description="Account ID for Redis lookup or direct call."), # Updated default account_id
'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
# Use Airflow Variable for downloads directory, matching reference DAG structure
'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json. Uses Airflow Variable 'DOWNLOADS_TEMP' or default.")
}
) as dag:
# Define Tasks
get_token = YtdlpOpsOperator(
task_id='get_token',
# Pass templated parameters from DAG run config
url="{{ params.url }}",
redis_enabled="{{ params.redis_enabled }}",
service_ip="{{ params.service_ip }}",
service_port="{{ params.service_port }}",
account_id="{{ params.account_id }}",
save_info_json=True,
info_json_dir="{{ params.info_json_dir }}",
get_socks_proxy=True,
store_socks_proxy=True,
timeout="{{ params.timeout }}",
retries=MAX_RETRIES, # Operator-specific retries if needed, else use DAG default
retry_delay=RETRY_DELAY, # Operator-specific delay if needed
# Add callbacks for logging success/failure, similar to reference DAG
on_failure_callback=lambda context: logger.error(f"Task {context['task_instance_key_str']} failed."),
on_success_callback=lambda context: logger.info(f"Task {context['task_instance_key_str']} succeeded.")
)
# Add task documentation (visible in Airflow UI)
get_token.doc_md = """
### Get Token Task
Connects to the YTDLP Thrift service (either directly or via Redis discovery)
to retrieve an authentication token and video metadata (info.json).
**Pushes to XCom:**
- `info_json_path`: Path to the saved info.json file (or None if not saved/failed).
- `socks_proxy`: The extracted SOCKS proxy string (or None if not requested/found).
- `ytdlp_command`: The original command string received from the server (contains tokens/cookies).
Uses parameters defined in the DAG run configuration.
"""
# Optional: Add a task to explicitly check XComs for debugging (like in reference DAG)
def _check_xcom_callable(**context):
"""Logs XCom values pushed by the get_token task."""
ti = context['task_instance']
logger.info("--- Checking XCom values pushed by get_token ---")
keys_to_check = ['info_json_path', 'socks_proxy', 'ytdlp_command']
xcom_values = {}
for key in keys_to_check:
value = ti.xcom_pull(task_ids='get_token', key=key)
xcom_values[key] = value
# Avoid logging potentially sensitive command details fully in production
if key == 'ytdlp_command' and value:
log_value = f"{value[:50]}..." # Log truncated command
else:
log_value = value
logger.info(f"XCom key='{key}': {log_value}")
logger.info("----------------------------------------------")
return xcom_values # Return values for potential future use
check_xcom_task = PythonOperator(
task_id='check_xcom_after_get_token',
python_callable=_check_xcom_callable,
)
check_xcom_task.doc_md = "Logs the values pushed to XCom by the 'get_token' task for debugging purposes."
display_info = PythonOperator(
task_id='display_token_info',
python_callable=display_token_info,
trigger_rule='all_success'
)
display_info.doc_md = """
### Display Token Info Task
Pulls information from XCom, parses the `info.json` file (if available),
logs video details, and generates example `yt-dlp` commands.
**Pulls from XCom (task_id='get_token'):**
- `info_json_path`
- `socks_proxy`
- `ytdlp_command`
**Pushes to XCom:**
- `download_cmd`: Base command using `--load-info-json` (user needs to add format/output).
"""
store_info = PythonOperator(
task_id='store_token_info', # Use consistent task ID naming
python_callable=store_token_info,
)
store_info.doc_md = """
### Store Token Info Task
Pulls information from XCom and DAG parameters, reads the `info.json` content,
and stores relevant data in a Redis hash.
**Pulls from XCom (task_id='get_token'):**
- `ytdlp_command`
- `socks_proxy`
- `info_json_path`
**Pulls from DAG context:**
- `params['url']` (or `dag_run.conf['url']`)
**Stores in Redis Hash (key: `token_info:<video_id>`):**
- `url`: The video URL.
- `ytdlp_command`: Base command using `--load-info-json`.
- `proxy`: The SOCKS proxy string.
- `info_json_path`: Path to the saved info.json file.
- `info_json`: The full content of the info.json file (as a JSON string).
- `video_id`: Extracted video ID.
- `timestamp`: Unix timestamp of storage.
- `ytdlp_command_tokens`: The original command string from the server (contains tokens/cookies).
Sets a 24-hour expiration on the Redis key.
"""
# Define task dependencies matching the reference DAG structure
get_token >> check_xcom_task >> display_info >> store_info
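# Example (illustrative): triggering this DAG from the Airflow CLI with a run config
# that overrides the Params above. The values shown are just the parameter defaults.
#
#   airflow dags trigger ytdlp_client_dag_v2.1 \
#       --conf '{"url": "https://www.youtube.com/watch?v=sOlTX9uxUtM", "redis_enabled": false, "service_ip": "85.192.30.55", "service_port": 9090}'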

View File

@ -0,0 +1,179 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.
"""
Airflow DAG for manually checking the status (type and size) of a specific Redis key used by YTDLP queues.
"""
from airflow import DAG
from airflow.exceptions import AirflowException
from airflow.models.param import Param
from airflow.operators.python import PythonOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
from datetime import datetime, timedelta, timezone
import logging
import json
import redis # Import redis exceptions if needed
# Configure logging
logger = logging.getLogger(__name__)
# Default settings
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_QUEUE_BASE_NAME = 'video_queue'
DEFAULT_MAX_ITEMS_TO_LIST = 25
# Import utility functions
from utils.redis_utils import _get_redis_client
# --- Python Callable for Check and List Task ---
def check_and_list_queue_callable(**context):
"""Checks the type and size of a Redis key and lists its recent contents."""
params = context['params']
redis_conn_id = params['redis_conn_id']
# queue_suffix is passed from the PythonOperator's op_kwargs, which are available in the context
queue_suffix = context['queue_suffix']
queue_name = params.get('queue_name', DEFAULT_QUEUE_BASE_NAME)
queue_to_check = f"{queue_name}{queue_suffix}"
max_items = int(params.get('max_items_to_list', DEFAULT_MAX_ITEMS_TO_LIST))
logger.info(f"--- Checking Status and Contents of Redis Key: '{queue_to_check}' ---")
logger.info(f"Using connection '{redis_conn_id}', listing up to {max_items} items.")
try:
redis_client = _get_redis_client(redis_conn_id)
key_type_bytes = redis_client.type(queue_to_check)
key_type = key_type_bytes.decode('utf-8')
if key_type == 'list':
list_length = redis_client.llen(queue_to_check)
logger.info(f"Redis key '{queue_to_check}' is a LIST with {list_length} items.")
if list_length > 0:
items_to_fetch = min(max_items, list_length)
# lrange with negative indices gets items from the end (most recent for rpush)
contents_bytes = redis_client.lrange(queue_to_check, -items_to_fetch, -1)
contents = [item.decode('utf-8') for item in contents_bytes]
contents.reverse() # Show most recent first
logger.info(f"--- Showing most recent {len(contents)} of {list_length} items ---")
for i, item in enumerate(contents):
logger.info(f" [recent_{i}]: {item}")
if list_length > len(contents):
logger.info(f" ... ({list_length - len(contents)} older items not shown)")
logger.info(f"--- End of List Contents ---")
elif key_type == 'hash':
hash_size = redis_client.hlen(queue_to_check)
logger.info(f"Redis key '{queue_to_check}' is a HASH with {hash_size} fields.")
if hash_size > 0:
logger.info(f"--- Showing a sample of up to {max_items} fields ---")
item_count = 0
# Using hscan_iter to safely iterate over hash fields, count is a hint
for field_bytes, value_bytes in redis_client.hscan_iter(queue_to_check, count=max_items):
if item_count >= max_items:
logger.info(f" ... (stopped listing after {max_items} items of {hash_size})")
break
field = field_bytes.decode('utf-8')
value = value_bytes.decode('utf-8')
# Try to pretty-print if value is JSON
try:
parsed_value = json.loads(value)
# Check for timestamp to show age
timestamp = parsed_value.get('end_time') or parsed_value.get('start_time')
age_str = ""
if timestamp:
age_seconds = (datetime.now(timezone.utc) - datetime.fromtimestamp(timestamp, timezone.utc)).total_seconds()
age_str = f" (age: {timedelta(seconds=age_seconds)})"
pretty_value = json.dumps(parsed_value, indent=2)
logger.info(f" Field '{field}'{age_str}:\n{pretty_value}")
except (json.JSONDecodeError, TypeError):
logger.info(f" Field '{field}': {value}")
item_count += 1
logger.info(f"--- End of Hash Contents ---")
elif key_type == 'none':
logger.info(f"Redis key '{queue_to_check}' does not exist.")
else:
logger.info(f"Redis key '{queue_to_check}' is of type '{key_type}'. Listing contents for this type is not implemented.")
except Exception as e:
logger.error(f"Failed to check/list contents of Redis key '{queue_to_check}': {e}", exc_info=True)
raise AirflowException(f"Failed to process Redis key: {e}")
# --- DAG Definition ---
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 0, # No retries for a manual check/list operation
'start_date': days_ago(1)
}
with DAG(
dag_id='ytdlp_mgmt_queues_check_status',
default_args=default_args,
schedule_interval=None, # Manually triggered
catchup=False,
description='Manually check the status and recent items of all YTDLP Redis queues for a given base name.',
tags=['ytdlp', 'queue', 'management', 'redis', 'manual', 'status', 'list'],
params={
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
'queue_name': Param(
DEFAULT_QUEUE_BASE_NAME,
type="string",
description="Base name for the Redis queues (e.g., 'video_queue')."
),
'max_items_to_list': Param(DEFAULT_MAX_ITEMS_TO_LIST, type="integer", description="Maximum number of recent items/fields to list from each queue."),
}
) as dag:
check_inbox_queue = PythonOperator(
task_id='check_inbox_queue',
python_callable=check_and_list_queue_callable,
op_kwargs={'queue_suffix': '_inbox'},
)
check_inbox_queue.doc_md = """
### Check Inbox Queue (`_inbox`)
Checks the status and lists the most recent URLs waiting to be processed.
The full queue name is `{{ params.queue_name }}_inbox`.
"""
check_progress_queue = PythonOperator(
task_id='check_progress_queue',
python_callable=check_and_list_queue_callable,
op_kwargs={'queue_suffix': '_progress'},
)
check_progress_queue.doc_md = """
### Check Progress Queue (`_progress`)
Checks the status and lists a sample of URLs currently being processed.
The full queue name is `{{ params.queue_name }}_progress`.
"""
check_result_queue = PythonOperator(
task_id='check_result_queue',
python_callable=check_and_list_queue_callable,
op_kwargs={'queue_suffix': '_result'},
)
check_result_queue.doc_md = """
### Check Result Queue (`_result`)
Checks the status and lists a sample of successfully processed URLs.
The full queue name is `{{ params.queue_name }}_result`.
"""
check_fail_queue = PythonOperator(
task_id='check_fail_queue',
python_callable=check_and_list_queue_callable,
op_kwargs={'queue_suffix': '_fail'},
)
check_fail_queue.doc_md = """
### Check Fail Queue (`_fail`)
Checks the status and lists a sample of failed URLs.
The full queue name is `{{ params.queue_name }}_fail`.
"""

View File

@ -0,0 +1,343 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.
"""
DAG for processing a single YouTube URL passed via DAG run configuration.
This is the "Worker" part of a Sensor/Worker pattern.
This DAG has been refactored to use the TaskFlow API to implement worker affinity,
ensuring all tasks for a single URL run on the same machine.
"""
from __future__ import annotations
from airflow.decorators import task, task_group
from airflow.exceptions import AirflowException, AirflowSkipException
from airflow.models import Variable
from airflow.models.dag import DAG
from airflow.models.param import Param
from airflow.models.xcom_arg import XComArg
from airflow.operators.dummy import DummyOperator
from airflow.operators.bash import BashOperator
from airflow.utils.dates import days_ago
from airflow.api.common.trigger_dag import trigger_dag
from datetime import timedelta, datetime
import json
import logging
import os
import random
import re
import socket
import time
import traceback
import uuid
import subprocess
import shlex
# Import utility functions and Thrift modules
from utils.redis_utils import _get_redis_client
# Handle potential import issues with Thrift modules
try:
from pangramia.yt.common.ttypes import TokenUpdateMode
except ImportError as e:
logging.warning(f"Could not import TokenUpdateMode from pangramia.yt.common.ttypes: {e}")
TokenUpdateMode = None
try:
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
except ImportError as e:
logging.warning(f"Could not import PBServiceException/PBUserException from pangramia.yt.exceptions.ttypes: {e}")
PBServiceException = Exception
PBUserException = Exception
try:
from pangramia.yt.tokens_ops import YTTokenOpService
except ImportError as e:
logging.warning(f"Could not import YTTokenOpService from pangramia.yt.tokens_ops: {e}")
YTTokenOpService = None
try:
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport
from thrift.transport.TTransport import TTransportException
except ImportError as e:
logging.warning(f"Could not import thrift modules: {e}")
TBinaryProtocol = None
TSocket = None
TTransport = None
TTransportException = Exception
# Configure logging
logger = logging.getLogger(__name__)
# Default settings from Airflow Variables or hardcoded fallbacks
DEFAULT_QUEUE_NAME = 'video_queue'
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_TIMEOUT = 3600
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)
# The queue is set to a fallback here. The actual worker-specific queue is
# assigned just-in-time by the task_instance_mutation_hook in airflow_local_settings.py,
# which reads the 'worker_queue' from the DAG run configuration.
DEFAULT_ARGS = {
'owner': 'airflow',
'retries': 0,
'queue': 'queue-dl', # Fallback queue. Will be overridden by the policy hook.
}
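# Minimal sketch of the policy hook referenced above, for illustration only. It is
# assumed to live in airflow_local_settings.py (not in this file) and to be able to
# resolve the DagRun for the task instance; the exact lookup may vary by Airflow version.
#
# def task_instance_mutation_hook(task_instance):
#     dag_run = task_instance.get_dagrun()
#     conf = (dag_run.conf or {}) if dag_run else {}
#     worker_queue = conf.get('worker_queue')
#     if worker_queue:
#         # Pin every task of this run to the dispatcher-chosen worker queue.
#         task_instance.queue = worker_queue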
# --- Helper Functions ---
def _get_thrift_client(host, port, timeout):
"""Helper to create and connect a Thrift client."""
if not TSocket or not TTransport or not TBinaryProtocol:
raise AirflowException("Required Thrift modules are not available")
transport = TSocket.TSocket(host, port)
transport.setTimeout(timeout * 1000)
transport = TTransport.TFramedTransport(transport)
protocol = TBinaryProtocol.TBinaryProtocol(transport)  # Bind the protocol to the framed transport
client = YTTokenOpService.Client(protocol) if YTTokenOpService else None
if client:
transport.open()
logger.info(f"Connected to Thrift server at {host}:{port}")
return client, transport
def _extract_video_id(url):
"""Extracts YouTube video ID from URL."""
if not url or not isinstance(url, str):
return None
patterns = [r'v=([a-zA-Z0-9_-]{11})', r'youtu\.be/([a-zA-Z0-9_-]{11})']
for pattern in patterns:
match = re.search(pattern, url)
if match:
return match.group(1)
return None
def _get_account_pool(params: dict) -> list:
"""
Gets the list of accounts to use for processing, filtering out banned/resting accounts.
Supports explicit list, prefix-based generation, and single account modes.
"""
account_pool_str = params.get('account_pool', 'default_account')
accounts = []
is_prefix_mode = False
if ',' in account_pool_str:
accounts = [acc.strip() for acc in account_pool_str.split(',') if acc.strip()]
else:
prefix = account_pool_str
pool_size_param = params.get('account_pool_size')
if pool_size_param is not None:
is_prefix_mode = True
pool_size = int(pool_size_param)
accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
else:
accounts = [prefix]
if not accounts:
raise AirflowException("Initial account pool is empty.")
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
try:
redis_client = _get_redis_client(redis_conn_id)
active_accounts = []
for account in accounts:
status_bytes = redis_client.hget(f"account_status:{account}", "status")
status = status_bytes.decode('utf-8') if status_bytes else "ACTIVE"
if status not in ['BANNED'] and 'RESTING' not in status:
active_accounts.append(account)
if not active_accounts and accounts:
auto_create = params.get('auto_create_new_accounts_on_exhaustion', False)
if auto_create and is_prefix_mode:
new_account_id = f"{account_pool_str}-auto-{str(uuid.uuid4())[:8]}"
logger.warning(f"Account pool exhausted. Auto-creating new account: '{new_account_id}'")
active_accounts.append(new_account_id)
else:
raise AirflowException("All accounts in the configured pool are currently exhausted.")
accounts = active_accounts
except Exception as e:
logger.error(f"Could not filter accounts from Redis. Using unfiltered pool. Error: {e}", exc_info=True)
if not accounts:
raise AirflowException("Account pool is empty after filtering.")
logger.info(f"Final active account pool with {len(accounts)} accounts.")
return accounts
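# Illustrative examples of how the account_pool params expand (hypothetical values),
# before the Redis status filter removes BANNED/RESTING accounts:
#   _get_account_pool({'account_pool': 'acc_a, acc_b'})
#       -> ['acc_a', 'acc_b']                                   (explicit list mode)
#   _get_account_pool({'account_pool': 'account_fr', 'account_pool_size': 3})
#       -> ['account_fr_01', 'account_fr_02', 'account_fr_03']  (prefix mode)
#   _get_account_pool({'account_pool': 'single_account'})
#       -> ['single_account']                                   (single account mode)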
# =============================================================================
# TASK DEFINITIONS (TaskFlow API)
# =============================================================================
@task
def get_url_and_assign_account(**context):
"""
Gets the URL to process from the DAG run configuration and assigns an active account.
This is the first task in the pinned-worker DAG.
"""
params = context['params']
# Update yt-dlp to latest nightly before every run
subprocess.run(["/usr/local/bin/update-yt-dlp.sh"], check=True)
# The URL is passed by the dispatcher DAG.
url_to_process = params.get('url_to_process')
if not url_to_process:
raise AirflowException("'url_to_process' was not found in the DAG run configuration.")
logger.info(f"Received URL '{url_to_process}' to process.")
# Account assignment logic is the same as before.
account_id = random.choice(_get_account_pool(params))
logger.info(f"Selected account '{account_id}' for this run.")
return {
'url_to_process': url_to_process,
'account_id': account_id,
'accounts_tried': [account_id],
}
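# Illustrative sketch of the dispatcher side of the Sensor/Worker pattern described in
# the module docstring: it hands a single URL to this worker DAG via the run config.
# The dag_id and 'worker_queue' value are placeholders; this DAG only requires
# 'url_to_process', while 'worker_queue' is read by the policy hook if present.
#
#   trigger_dag(
#       dag_id='ytdlp_process_single_url',
#       run_id=f"manual__{uuid.uuid4()}",
#       conf={
#           'url_to_process': 'https://www.youtube.com/watch?v=sOlTX9uxUtM',
#           'worker_queue': 'queue-dl-host01',
#       },
#   )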
@task
def get_token(initial_data: dict, **context):
"""Makes a single attempt to get a token from the Thrift service."""
ti = context['task_instance']
params = context['params']
account_id = initial_data['account_id']
url = initial_data['url_to_process']
info_json_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')
host, port, timeout = params['service_ip'], int(params['service_port']), int(params.get('timeout', DEFAULT_TIMEOUT))
machine_id = params.get('machine_id') or socket.gethostname()
logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' ---")
client, transport = None, None
try:
client, transport = _get_thrift_client(host, port, timeout)
if not client or not TokenUpdateMode:
raise AirflowException("Thrift client or TokenUpdateMode not available")
token_data = client.getOrRefreshToken(accountId=account_id, updateType=TokenUpdateMode.AUTO, url=url, clients=params.get('clients'), machineId=machine_id)
info_json = getattr(token_data, 'infoJson', None)
if not (info_json and json.loads(info_json)):
raise AirflowException("Service returned success but info.json was empty or invalid.")
video_id = _extract_video_id(url)
os.makedirs(info_json_dir, exist_ok=True)
# Use a readable timestamp for a unique filename on each attempt.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
info_json_path = os.path.join(info_json_dir, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json")
with open(info_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
return {
'info_json_path': info_json_path,
'socks_proxy': getattr(token_data, proxy_attr) if proxy_attr else None,
'ytdlp_command': getattr(token_data, 'ytdlpCommand', None),
'successful_account_id': account_id,
'original_url': url, # Include original URL for fallback
}
except (PBServiceException, PBUserException, TTransportException) as e:
error_context = getattr(e, 'context', None)
if isinstance(error_context, str):
try: error_context = json.loads(error_context.replace("'", "\""))
except Exception: pass  # Best-effort parse; keep the raw string if it is not valid JSON
error_details = {
'error_message': getattr(e, 'message', str(e)),
'error_code': getattr(e, 'errorCode', 'TRANSPORT_ERROR'),
'proxy_url': error_context.get('proxy_url') if isinstance(error_context, dict) else None
}
logger.error(f"Thrift call failed for account '{account_id}'. Exception: {error_details['error_message']}")
ti.xcom_push(key='error_details', value=error_details)
# If it's not a connection error, run diagnostic yt-dlp command
if error_details['error_code'] not in ["SOCKS5_CONNECTION_FAILED", "SOCKET_TIMEOUT", "TRANSPORT_ERROR", "CAMOUFOX_TIMEOUT"]:
_run_diagnostic_yt_dlp(url, error_details.get('proxy_url'), params.get('clients', 'web'))
raise AirflowException(f"Thrift call failed: {error_details['error_message']}")
finally:
if transport and transport.isOpen():
transport.close()
def _run_diagnostic_yt_dlp(url, proxy, clients):
"""Runs yt-dlp with diagnostic flags to capture failed responses."""
logger.warning("Running diagnostic yt-dlp command to capture failed response...")
dump_dir = "/opt/airflow/dumps"
os.makedirs(dump_dir, exist_ok=True)
video_id = _extract_video_id(url)
dump_file = os.path.join(dump_dir, f"diagnostic_{video_id}_{int(time.time())}.dump")
cmd = [
'yt-dlp',
'--extractor-args', f'youtube:player-client={clients}',
'--write-pages',
'--proxy', proxy or '',
'-FvU',
url,
'--write-info-json',
'--print', 'filename',
'--continue',
'--no-progress',
'--no-simulate',
'--ignore-errors',
'--no-playlist'
]
logger.info(f"Executing diagnostic command: {' '.join(shlex.quote(arg) for arg in cmd)}")
logger.info(f"Diagnostic dump will be saved to: {dump_file}")
try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
logger.info(f"Diagnostic yt-dlp exit code: {result.returncode}")
if result.stdout:
logger.info(f"Diagnostic output:\n{result.stdout}")
if result.stderr:
logger.error(f"Diagnostic stderr:\n{result.stderr}")
except subprocess.TimeoutExpired:
logger.error("Diagnostic yt-dlp command timed out after 5 minutes")
except Exception as e:
logger.error(f"Failed to run diagnostic yt-dlp: {e}")
@task.branch
def handle_bannable_error_branch(task_id_to_check: str, **context):
"""Inspects a failed task and routes to retry logic if the error is bannable."""
ti = context['task_instance']
params = context['params']
error_details = ti.xcom_pull(task_ids=task_id_to_check, key='error_details')
if not error_details:
return None # Let DAG fail for unexpected errors
error_code = error_details.get('error_code', '').strip()
policy = params.get('on_bannable_failure', 'retry_with_new_account')
# Connection errors should be retried without banning the account.
connection_errors = ['SOCKS5_CONNECTION_FAILED', 'SOCKET_TIMEOUT', 'TRANSPORT_ERROR', 'CAMOUFOX_TIMEOUT']
if error_code in connection_errors:
logger.info(f"Handling connection error '{error_code}' from '{task_id_to_check}'. Policy: '{policy}'")
if policy == 'stop_loop':
logger.warning(f"Connection error with 'stop_loop' policy. Failing DAG without banning.")
return None
else:
logger.info("Retrying with a new account without banning.")
return 'assign_new_account_for_retry'
is_bannable = error_code in ["BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED"]
logger.info(f"Handling failure from '{task_id_to_check}'. Error code: '{error_code}', Policy: '{policy}'")
if is_bannable and policy in ['retry_with_new_account', 'retry_and_ban_account_only']:
return 'ban_account_and_prepare_for_retry'
if is_bannable and policy in ['retry_on_connection_error', 'retry_without_ban']:
return 'assign_new_account_for_retry'
if is_bannable: # stop_loop
return 'ban_and_fail'
return None # Not a bannable error, let DAG fail

View File

@ -0,0 +1,707 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.
"""
DAG for processing YouTube URLs sequentially from a Redis queue using YTDLP Ops Thrift service.
"""
from airflow import DAG
from airflow.exceptions import AirflowException, AirflowSkipException, AirflowFailException
from airflow.hooks.base import BaseHook
from airflow.models import BaseOperator, Variable
from airflow.models.param import Param
from airflow.operators.bash import BashOperator # Import BashOperator
from airflow.operators.python import PythonOperator
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
from airflow.utils.decorators import apply_defaults
from datetime import datetime, timedelta
from pangramia.yt.common.ttypes import TokenUpdateMode
from pangramia.yt.exceptions.ttypes import PBServiceException
from pangramia.yt.tokens_ops import YTTokenOpService
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport
from thrift.transport.TTransport import TTransportException
import json
import logging
import os
import redis # Import redis exceptions if needed
import socket
import time
import traceback # For logging stack traces in failure handler
# Configure logging
logger = logging.getLogger(__name__)
# Default settings
DEFAULT_QUEUE_NAME = 'video_queue' # Base name for queues
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_TIMEOUT = 30 # Default Thrift timeout in seconds
MAX_RETRIES_REDIS_LOOKUP = 3 # Retries for fetching service details from Redis
RETRY_DELAY_REDIS_LOOKUP = 10 # Delay (seconds) for Redis lookup retries
# --- Helper Functions ---
from utils.redis_utils import _get_redis_client
def _extract_video_id(url):
"""Extracts YouTube video ID from URL."""
if not url or not isinstance(url, str):
logger.debug("URL is empty or not a string, cannot extract video ID.")
return None
try:
video_id = None
if 'youtube.com/watch?v=' in url:
video_id = url.split('v=')[1].split('&')[0]
elif 'youtu.be/' in url:
video_id = url.split('youtu.be/')[1].split('?')[0]
if video_id and len(video_id) >= 11:
video_id = video_id[:11] # Standard ID length
logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
return video_id
else:
logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
return None
except Exception as e:
logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
return None
# --- Queue Management Callables ---
def pop_url_from_queue(**context):
"""Pops a URL from the inbox queue and pushes to XCom."""
params = context['params']
queue_name = params['queue_name']
inbox_queue = f"{queue_name}_inbox"
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
logger.info(f"Attempting to pop URL from inbox queue: {inbox_queue}")
try:
client = _get_redis_client(redis_conn_id)
# LPOP is non-blocking, returns None if empty
url_bytes = client.lpop(inbox_queue) # Returns bytes if decode_responses=False on hook/client
if url_bytes:
url = url_bytes.decode('utf-8') if isinstance(url_bytes, bytes) else url_bytes
logger.info(f"Popped URL: {url}")
context['task_instance'].xcom_push(key='current_url', value=url)
return url # Return URL for logging/potential use
else:
logger.info(f"Inbox queue '{inbox_queue}' is empty. Skipping downstream tasks.")
context['task_instance'].xcom_push(key='current_url', value=None)
# Raise AirflowSkipException to signal downstream tasks to skip
raise AirflowSkipException(f"Inbox queue '{inbox_queue}' is empty.")
except AirflowSkipException:
raise # Re-raise skip exception
except Exception as e:
logger.error(f"Error popping URL from Redis queue '{inbox_queue}': {e}", exc_info=True)
raise AirflowException(f"Failed to pop URL from Redis: {e}")
def move_url_to_progress(**context):
"""Moves the current URL from XCom to the progress hash."""
ti = context['task_instance']
url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url')
# This task should be skipped if pop_url_from_queue raised AirflowSkipException
# Adding check for robustness
if not url:
logger.info("No URL found in XCom (or upstream skipped). Skipping move to progress.")
raise AirflowSkipException("No URL to process.")
params = context['params']
queue_name = params['queue_name']
progress_queue = f"{queue_name}_progress"
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
logger.info(f"Moving URL '{url}' to progress hash: {progress_queue}")
progress_data = {
'status': 'processing',
'start_time': time.time(),
'dag_run_id': context['dag_run'].run_id,
'task_instance_key_str': context['task_instance_key_str']
}
try:
client = _get_redis_client(redis_conn_id)
client.hset(progress_queue, url, json.dumps(progress_data))
logger.info(f"Moved URL '{url}' to progress hash '{progress_queue}'.")
except Exception as e:
logger.error(f"Error moving URL to Redis progress hash '{progress_queue}': {e}", exc_info=True)
# If this fails, the URL is popped but not tracked as processing. Fail the task.
raise AirflowException(f"Failed to move URL to progress hash: {e}")
def handle_success(**context):
"""Moves URL from progress to result hash on success."""
ti = context['task_instance']
url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url')
if not url:
logger.warning("handle_success called but no URL found from pop_url_from_queue XCom. This shouldn't happen on success path.")
return # Or raise error
params = context['params']
queue_name = params['queue_name']
progress_queue = f"{queue_name}_progress"
result_queue = f"{queue_name}_result"
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
# Pull results from get_token task
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command') # Original command
downloaded_file_path = ti.xcom_pull(task_ids='download_video') # Pull from download_video task
logger.info(f"Handling success for URL: {url}")
logger.info(f" Info JSON Path: {info_json_path}")
logger.info(f" SOCKS Proxy: {socks_proxy}")
logger.info(f" YTDLP Command: {ytdlp_command[:100] if ytdlp_command else 'None'}...") # Log truncated command
logger.info(f" Downloaded File Path: {downloaded_file_path}")
result_data = {
'status': 'success',
'end_time': time.time(),
'info_json_path': info_json_path,
'socks_proxy': socks_proxy,
'ytdlp_command': ytdlp_command,
'downloaded_file_path': downloaded_file_path,
'url': url,
'dag_run_id': context['dag_run'].run_id,
'task_instance_key_str': context['task_instance_key_str'] # Record which task instance succeeded
}
try:
client = _get_redis_client(redis_conn_id)
# Remove from progress hash
removed_count = client.hdel(progress_queue, url)
if removed_count > 0:
logger.info(f"Removed URL '{url}' from progress hash '{progress_queue}'.")
else:
logger.warning(f"URL '{url}' not found in progress hash '{progress_queue}' during success handling.")
# Add to result hash
client.hset(result_queue, url, json.dumps(result_data))
logger.info(f"Stored success result for URL '{url}' in result hash '{result_queue}'.")
except Exception as e:
logger.error(f"Error handling success in Redis for URL '{url}': {e}", exc_info=True)
# Even if Redis fails, the task succeeded. Log error but don't fail the task.
# Consider adding retry logic for Redis operations here or marking state differently.
def handle_failure(**context):
"""
Handles failed processing. Depending on the `requeue_on_failure` parameter,
it either moves the URL to the fail hash or re-queues it in the inbox.
If `stop_on_failure` is True, this task will fail, stopping the DAG loop.
"""
ti = context['task_instance']
url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url')
if not url:
logger.error("handle_failure called but no URL found from pop_url_from_queue XCom.")
return
params = context['params']
queue_name = params['queue_name']
progress_queue = f"{queue_name}_progress"
fail_queue = f"{queue_name}_fail"
inbox_queue = f"{queue_name}_inbox"
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
requeue_on_failure = params.get('requeue_on_failure', False)
stop_on_failure = params.get('stop_on_failure', True) # Default to True
exception = context.get('exception')
error_message = str(exception) if exception else "Unknown error"
tb_str = "".join(traceback.format_exception(type(exception), exception, exception.__traceback__)) if isinstance(exception, BaseException) else "No traceback available."  # format_exc() is empty outside an active except block
logger.info(f"Handling failure for URL: {url}")
logger.error(f" Failure Reason: {error_message}")
logger.debug(f" Traceback:\n{tb_str}")
try:
client = _get_redis_client(redis_conn_id)
# Always remove from progress hash first
removed_count = client.hdel(progress_queue, url)
if removed_count > 0:
logger.info(f"Removed URL '{url}' from progress hash '{progress_queue}'.")
else:
logger.warning(f"URL '{url}' not found in progress hash '{progress_queue}' during failure handling.")
if requeue_on_failure:
# Re-queue the URL for another attempt
client.rpush(inbox_queue, url)
logger.info(f"Re-queued failed URL '{url}' to inbox '{inbox_queue}' for retry.")
else:
# Move to the permanent fail hash
fail_data = {
'status': 'failed',
'end_time': time.time(),
'error': error_message,
'traceback': tb_str,
'url': url,
'dag_run_id': context['dag_run'].run_id,
'task_instance_key_str': context['task_instance_key_str']
}
client.hset(fail_queue, url, json.dumps(fail_data))
logger.info(f"Stored failure details for URL '{url}' in fail hash '{fail_queue}'.")
except Exception as e:
logger.error(f"Error during failure handling in Redis for URL '{url}': {e}", exc_info=True)
# This is a critical error in the failure handling logic itself.
raise AirflowException(f"Could not handle failure in Redis: {e}")
# After handling Redis, decide whether to fail the task to stop the loop
if stop_on_failure:
logger.error("stop_on_failure is True. Failing this task to stop the DAG loop.")
# Re-raise the original exception to fail the task instance.
# This is better than AirflowFailException because it preserves the original error.
if exception:
raise exception
else:
# If for some reason there's no exception, fail explicitly.
raise AirflowFailException("Failing task as per stop_on_failure=True, but original exception was not found.")
# --- YtdlpOpsOperator ---
class YtdlpOpsOperator(BaseOperator):
"""
Custom Airflow operator to interact with YTDLP Thrift service. Handles direct connections
and Redis-based discovery, retrieves tokens, saves info.json, and manages errors.
Modified to pull URL from XCom for sequential processing.
"""
# Removed 'url' from template_fields as it's pulled from XCom
template_fields = ('service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir', 'redis_conn_id')
@apply_defaults
def __init__(self,
# url parameter removed - will be pulled from XCom
redis_conn_id=DEFAULT_REDIS_CONN_ID,
max_retries_lookup=MAX_RETRIES_REDIS_LOOKUP,
retry_delay_lookup=RETRY_DELAY_REDIS_LOOKUP,
service_ip=None,
service_port=None,
redis_enabled=False, # Default to direct connection now
account_id=None,
# save_info_json removed, always True
info_json_dir=None,
# get_socks_proxy removed, always True
# store_socks_proxy removed, always True
# get_socks_proxy=True, # Removed
# store_socks_proxy=True, # Store proxy in XCom by default # Removed
timeout=DEFAULT_TIMEOUT,
*args, **kwargs):
super().__init__(*args, **kwargs)
logger.info(f"Initializing YtdlpOpsOperator (Processor Version) with parameters: "
f"redis_conn_id={redis_conn_id}, max_retries_lookup={max_retries_lookup}, retry_delay_lookup={retry_delay_lookup}, "
f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, "
f"account_id={account_id}, info_json_dir={info_json_dir}, timeout={timeout}")
# save_info_json, get_socks_proxy, store_socks_proxy removed from log
# Validate parameters based on connection mode
if redis_enabled:
# If using Redis, account_id is essential for lookup
if not account_id:
raise ValueError("account_id is required when redis_enabled=True for service lookup.")
else:
# If direct connection, IP and Port are essential
if not service_ip or not service_port:
raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False.")
# Account ID is still needed for the API call itself; rely on the DAG param or operator config to supply it
if not account_id:
logger.warning("No account_id provided for direct connection mode. Ensure it's set in DAG params or operator config.")
# We won't assign 'default' here, let the value passed during instantiation be used.
# self.url is no longer needed here
self.redis_conn_id = redis_conn_id
self.max_retries_lookup = max_retries_lookup
self.retry_delay_lookup = int(retry_delay_lookup.total_seconds() if isinstance(retry_delay_lookup, timedelta) else retry_delay_lookup)
self.service_ip = service_ip
self.service_port = service_port
self.redis_enabled = redis_enabled
self.account_id = account_id
# self.save_info_json removed
self.info_json_dir = info_json_dir # Still needed
# self.get_socks_proxy removed
# self.store_socks_proxy removed
self.timeout = timeout
def execute(self, context):
logger.info("Executing YtdlpOpsOperator (Processor Version)")
transport = None
ti = context['task_instance'] # Get task instance for XCom access
try:
# --- Get URL from XCom ---
url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url')
if not url:
# This should ideally be caught by upstream skip, but handle defensively
logger.info("No URL found in XCom from pop_url_from_queue. Skipping execution.")
raise AirflowSkipException("Upstream task did not provide a URL.")
logger.info(f"Processing URL from XCom: {url}")
# --- End Get URL ---
logger.info("Getting task parameters and rendering templates")
params = context['params'] # DAG run params
# Render template fields using context
# Use render_template_as_native for better type handling if needed, else render_template
redis_conn_id = self.render_template(self.redis_conn_id, context)
service_ip = self.render_template(self.service_ip, context)
service_port_rendered = self.render_template(self.service_port, context)
account_id = self.render_template(self.account_id, context)
timeout_rendered = self.render_template(self.timeout, context)
info_json_dir = self.render_template(self.info_json_dir, context) # Rendered here for _save_info_json
# Determine effective settings (DAG params override operator defaults)
redis_enabled = params.get('redis_enabled', self.redis_enabled)
account_id = params.get('account_id', account_id) # Use DAG param if provided
redis_conn_id = params.get('redis_conn_id', redis_conn_id) # Use DAG param if provided
logger.info(f"Effective settings: redis_enabled={redis_enabled}, account_id='{account_id}', redis_conn_id='{redis_conn_id}'")
host = None
port = None
if redis_enabled:
# Get Redis connection using the helper for consistency
redis_client = _get_redis_client(redis_conn_id)
logger.info(f"Successfully connected to Redis using connection '{redis_conn_id}' for service discovery.")
# Get service details from Redis with retries
service_key = f"ytdlp:{account_id}"
legacy_key = account_id # For backward compatibility
for attempt in range(self.max_retries_lookup):
try:
logger.info(f"Attempt {attempt + 1}/{self.max_retries_lookup}: Fetching service details from Redis for keys: '{service_key}', '{legacy_key}'")
service_details = redis_client.hgetall(service_key)
if not service_details:
logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'")
service_details = redis_client.hgetall(legacy_key)
if not service_details:
raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}")
# Find IP and port (case-insensitive keys)
ip_key = next((k for k in service_details if k.lower() == 'ip'), None)
port_key = next((k for k in service_details if k.lower() == 'port'), None)
if not ip_key: raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}")
if not port_key: raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}")
host = service_details[ip_key] # Assumes decode_responses=True in hook
port_str = service_details[port_key]
try:
port = int(port_str)
except (ValueError, TypeError):
raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}")
logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}")
break # Success
except Exception as e:
logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {str(e)}")
if attempt == self.max_retries_lookup - 1:
logger.error("Max retries reached for fetching Redis details.")
raise AirflowException(f"Failed to get service details from Redis after {self.max_retries_lookup} attempts: {e}")
logger.info(f"Retrying in {self.retry_delay_lookup} seconds...")
time.sleep(self.retry_delay_lookup)
else:
# Direct connection: Use rendered/param values
host = params.get('service_ip', service_ip) # Use DAG param if provided
port_str = params.get('service_port', service_port_rendered) # Use DAG param if provided
logger.info(f"Using direct connection settings: service_ip={host}, service_port={port_str}")
if not host or not port_str:
raise ValueError("Direct connection requires service_ip and service_port (check Operator config and DAG params)")
try:
port = int(port_str)
except (ValueError, TypeError):
raise ValueError(f"Invalid service_port value: {port_str}")
logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)")
# Validate and use timeout
try:
timeout = int(timeout_rendered)
if timeout <= 0: raise ValueError("Timeout must be positive")
logger.info(f"Using timeout: {timeout} seconds")
except (ValueError, TypeError):
logger.warning(f"Invalid timeout value: '{timeout_rendered}'. Using default: {DEFAULT_TIMEOUT}")
timeout = DEFAULT_TIMEOUT
# Create Thrift connection objects
# socket_conn = TSocket.TSocket(host, port) # Original
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET) # Explicitly use AF_INET (IPv4)
socket_conn.setTimeout(timeout * 1000) # Thrift timeout is in milliseconds
transport = TTransport.TFramedTransport(socket_conn) # Use TFramedTransport if server expects it
# transport = TTransport.TBufferedTransport(socket_conn) # Use TBufferedTransport if server expects it
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = YTTokenOpService.Client(protocol)
logger.info(f"Attempting to connect to Thrift server at {host}:{port}...")
try:
transport.open()
logger.info("Successfully connected to Thrift server.")
# Test connection with ping
try:
client.ping()
logger.info("Server ping successful.")
except Exception as e:
logger.error(f"Server ping failed: {e}")
raise AirflowException(f"Server connection test (ping) failed: {e}")
# Get token from service using the URL from XCom
try:
logger.info(f"Requesting token for accountId='{account_id}', url='{url}'")
token_data = client.getOrRefreshToken(
accountId=account_id,
updateType=TokenUpdateMode.AUTO,
url=url # Use the url variable from XCom
)
logger.info("Successfully retrieved token data from service.")
except PBServiceException as e:
# Handle specific service exceptions
error_code = getattr(e, 'errorCode', 'N/A')
error_message = getattr(e, 'message', 'N/A')
error_context = getattr(e, 'context', {})
logger.error(f"PBServiceException occurred: Code={error_code}, Message={error_message}")
if error_context:
logger.error(f" Context: {error_context}") # Log context separately
# Construct a concise error message for AirflowException
error_msg = f"YTDLP service error (Code: {error_code}): {error_message}"
# Add specific error code handling if needed...
logger.error(f"Failing task instance due to PBServiceException: {error_msg}") # Add explicit log before raising
raise AirflowException(error_msg) # Fail task on service error
except TTransportException as e:
logger.error(f"Thrift transport error during getOrRefreshToken: {e}")
logger.error(f"Failing task instance due to TTransportException: {e}") # Add explicit log before raising
raise AirflowException(f"Transport error during API call: {e}")
except Exception as e:
logger.error(f"Unexpected error during getOrRefreshToken: {e}")
logger.error(f"Failing task instance due to unexpected error during API call: {e}") # Add explicit log before raising
raise AirflowException(f"Unexpected error during API call: {e}")
except TTransportException as e:
# Handle connection errors
logger.error(f"Thrift transport error during connection: {str(e)}")
logger.error(f"Failing task instance due to TTransportException during connection: {e}") # Add explicit log before raising
raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}")
# Removed the overly broad except Exception block here, as inner blocks raise AirflowException
# --- Process Token Data ---
logger.debug(f"Token data received. Attributes: {dir(token_data)}")
info_json_path = None # Initialize
# save_info_json is now always True
logger.info("Proceeding to save info.json (save_info_json=True).")
info_json = self._get_info_json(token_data)
if info_json and self._is_valid_json(info_json):
try:
# Pass rendered info_json_dir to helper
info_json_path = self._save_info_json(context, info_json, url, account_id, info_json_dir)
if info_json_path:
ti.xcom_push(key='info_json_path', value=info_json_path)
logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}")
else:
ti.xcom_push(key='info_json_path', value=None)
logger.warning("info.json saving failed (check logs from _save_info_json).")
except Exception as e:
logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True)
ti.xcom_push(key='info_json_path', value=None)
elif info_json:
logger.warning("Retrieved infoJson is not valid JSON. Skipping save.")
ti.xcom_push(key='info_json_path', value=None)
else:
logger.info("No infoJson found in token data. Skipping save.")
ti.xcom_push(key='info_json_path', value=None)
# Extract and potentially store SOCKS proxy
# get_socks_proxy and store_socks_proxy are now always True
socks_proxy = None
logger.info("Attempting to extract SOCKS proxy (get_socks_proxy=True).")
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
if proxy_attr:
socks_proxy = getattr(token_data, proxy_attr)
if socks_proxy:
logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
# Always store if found (store_socks_proxy=True)
ti.xcom_push(key='socks_proxy', value=socks_proxy)
logger.info("Pushed 'socks_proxy' to XCom.")
else:
logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty.")
# Store None if attribute found but empty
ti.xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as extracted value was empty.")
else:
logger.info("No SOCKS proxy attribute found in token data.")
# Store None if attribute not found
ti.xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as attribute was not found.")
# --- Removed old logic block ---
# # Extract and potentially store SOCKS proxy
# socks_proxy = None
# get_socks_proxy = params.get('get_socks_proxy', self.get_socks_proxy)
# store_socks_proxy = params.get('store_socks_proxy', self.store_socks_proxy)
#
# if get_socks_proxy:
# proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
# if proxy_attr:
# socks_proxy = getattr(token_data, proxy_attr)
# if socks_proxy:
# logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
# if store_socks_proxy:
# ti.xcom_push(key='socks_proxy', value=socks_proxy)
# logger.info("Pushed 'socks_proxy' to XCom.")
# else:
# logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty.")
# if store_socks_proxy: ti.xcom_push(key='socks_proxy', value=None)
# else:
# logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found.")
# if store_socks_proxy: ti.xcom_push(key='socks_proxy', value=None)
# else:
# logger.info("get_socks_proxy is False. Skipping proxy extraction.")
# if store_socks_proxy: ti.xcom_push(key='socks_proxy', value=None)
# --- End Removed old logic block ---
# Get the original command from the server, or construct a fallback
ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
if ytdlp_cmd:
logger.info(f"Original command received from server: {ytdlp_cmd[:100]}...") # Log truncated
else:
logger.warning("No 'ytdlpCommand' attribute found in token data. Constructing a fallback for logging.")
# Construct a representative command for logging purposes
if socks_proxy:
ytdlp_cmd = f"yt-dlp --dump-json --proxy \"{socks_proxy}\" \"{url}\""
else:
ytdlp_cmd = f"yt-dlp --dump-json \"{url}\""
logger.info(f"Constructed fallback command: {ytdlp_cmd}")
# Push the command to XCom
ti.xcom_push(key='ytdlp_command', value=ytdlp_cmd)
logger.info("Pushed command to XCom key 'ytdlp_command'.")
# No explicit return needed, success is implicit if no exception raised
except (AirflowSkipException, AirflowFailException) as e:
logger.info(f"Task skipped or failed explicitly: {e}")
raise # Re-raise to let Airflow handle state
except AirflowException as e: # Catch AirflowExceptions raised explicitly
logger.error(f"Operation failed due to AirflowException: {e}", exc_info=True)
raise # Re-raise AirflowExceptions to ensure task failure
except (TTransportException, PBServiceException) as e: # Catch specific Thrift/Service errors not already handled inside inner try
logger.error(f"Unhandled YTDLP Service/Transport error in outer block: {e}", exc_info=True)
logger.error(f"Failing task instance due to unhandled outer Service/Transport error: {e}") # Add explicit log before raising
raise AirflowException(f"Unhandled YTDLP service error: {e}") # Wrap in AirflowException to fail task
except Exception as e: # General catch-all for truly unexpected errors
logger.error(f"Caught unexpected error in YtdlpOpsOperator outer block: {e}", exc_info=True)
logger.error(f"Failing task instance due to unexpected outer error: {e}") # Add explicit log before raising
raise AirflowException(f"Unexpected error caused task failure: {e}") # Wrap to fail task
finally:
if transport and transport.isOpen():
logger.info("Closing Thrift transport.")
transport.close()
# --- Helper Methods ---
def _get_info_json(self, token_data):
"""Safely extracts infoJson from token data."""
return getattr(token_data, 'infoJson', None)
def _is_valid_json(self, json_str):
"""Checks if a string is valid JSON."""
if not json_str or not isinstance(json_str, str): return False
try:
json.loads(json_str)
return True
except json.JSONDecodeError:
return False
def _save_info_json(self, context, info_json, url, account_id, rendered_info_json_dir):
"""Saves info_json to a file. Uses pre-rendered directory path."""
try:
video_id = _extract_video_id(url) # Use standalone helper
save_dir = rendered_info_json_dir or "." # Use rendered path
logger.info(f"Target directory for info.json: {save_dir}")
# Ensure directory exists
try:
os.makedirs(save_dir, exist_ok=True)
logger.info(f"Ensured directory exists: {save_dir}")
except OSError as e:
logger.error(f"Could not create directory {save_dir}: {e}. Cannot save info.json.")
return None
# Construct filename
timestamp = int(time.time())
base_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json"
info_json_path = os.path.join(save_dir, base_filename)
latest_json_path = os.path.join(save_dir, "latest.json") # Path for the latest symlink/copy
# Write to timestamped file
try:
logger.info(f"Writing info.json content (received from service) to {info_json_path}...")
with open(info_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}")
except IOError as e:
logger.error(f"Failed to write info.json to {info_json_path}: {e}")
return None
# Write to latest.json (overwrite) - best effort
try:
with open(latest_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
logger.info(f"Updated latest.json file: {latest_json_path}")
except IOError as e:
logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}")
return info_json_path
except Exception as e:
logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True)
return None
# =============================================================================
# DAG Definition
# =============================================================================
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 1, # Default retries for tasks like queue management
'retry_delay': timedelta(minutes=1),
'start_date': days_ago(1),
# Add concurrency control if needed for sequential processing
# 'concurrency': 1, # Ensure only one task instance runs at a time per DAG run
# 'max_active_runs': 1, # Ensure only one DAG run is active
}
# Define DAG
#
# --- DAG Block Deactivated on 2025-07-16 ---
# This DAG has been replaced by the Sensor/Worker pattern implemented in:
# - ytdlp_sensor_redis_queue.py (polls the queue)
# - ytdlp_worker_per_url.py (processes a single URL)
# This code is kept for reference but is not active.
#
View File
@ -0,0 +1,974 @@
"""
DAG to deploy and manage YTDLP token service.
This DAG handles the deployment, monitoring, and cleanup of a YTDLP token service
for a given account. It supports both Redis-based service discovery and direct
connection via manually specified host and port.
Configuration Options:
- account_id: (Required) The account ID for which the service is being deployed.
- proxy: (Optional) The proxy to use for the service.
- redis_enabled: (Optional, default=False in this DAG's deploy task) Whether to use Redis for service discovery.
If False, you must provide `host` and `port` manually.
- host: (Optional) The host IP of the service. Required if `redis_enabled=False`.
- port: (Optional) The port of the service. Required if `redis_enabled=False`.
Usage:
1. Redis-based service discovery:
- Set `redis_enabled=True`.
- Ensure Redis is configured in Airflow connections.
- The DAG will automatically discover the service IP and port from Redis.
2. Manual host and port:
- Set `redis_enabled=False`.
- Provide `host` and `port` manually in the DAG configuration.
- Example: {"host": "192.168.1.100", "port": 9090}.
Example Trigger Configuration:
{
"account_id": "test_account",
"proxy": "socks5://proxy.example.com:1080",
"redis_enabled": False,
"host": "192.168.1.100",
"port": 9090
}
"""
from airflow import DAG
from airflow.models.param import Param
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import PythonOperator
# HttpSensor is no longer used
# from airflow.providers.http.sensors.http import HttpSensor
from airflow.utils.trigger_rule import TriggerRule
from airflow.hooks.base import BaseHook
from airflow.exceptions import AirflowException
from typing import Sequence # Add Sequence for type hinting
from datetime import datetime, timedelta
from airflow.utils.dates import days_ago # Add this import
import uuid
import os
import logging
import shutil
import docker
import redis
import requests
import socket
import time
import sys # Import sys for maxsize
from airflow.configuration import conf # Import conf
# Import and apply Thrift exceptions patch
try:
# Always apply the patch, regardless of environment
from thrift_exceptions_patch import patch_thrift_exceptions
patch_thrift_exceptions()
logging.info("Applied Thrift exceptions patch for Airflow compatibility")
# Verify the patch was applied correctly
try:
from pangramia.yt.exceptions.ttypes import PBServiceException
test_exception = PBServiceException(message="Test")
# Try to modify attributes to verify patch works
test_exception.args = ("Test",)
test_exception.message = "Modified test"
logging.info("Verified Thrift exception patch is working correctly")
except Exception as verify_error:
logging.error(f"Thrift exception patch verification failed: {verify_error}")
logging.error("This may cause 'immutable instance' errors during error handling")
except ImportError as e:
logging.warning(f"Could not import thrift_exceptions_patch: {e}")
logging.warning("Airflow compatibility will be affected - expect 'immutable instance' errors")
except Exception as e:
logging.error(f"Error applying Thrift exceptions patch: {e}")
# Default arguments for the DAG
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 0, # Disable retries for all tasks in this DAG
'retry_delay': timedelta(minutes=5),
# Removed 'queue': 'auth_queue' to use the default queue
# Optional: Further filter workers by tags if using CeleryExecutor
'executor_config': {"CeleryExecutor": {"tags": ["auth_node"]}},
}
def get_redis_connection(redis_host=None, redis_port=None):
"""Get a Redis connection using Airflow's Redis connection or manually specified host/port."""
if redis_host and redis_port:
# Use manually specified host and port
return redis.Redis(
host=redis_host,
port=redis_port,
db=0,
decode_responses=True
)
else:
# Use Airflow's Redis connection
redis_conn = BaseHook.get_connection("redis_default")
# Use the password from the connection if available, otherwise use 'airflow' as default
password = redis_conn.password or 'airflow'
return redis.Redis(
host=redis_conn.host, # 'redis' (service name in docker-compose)
port=redis_conn.port, # 6379
password=password,
db=0,
decode_responses=True
)
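# A minimal usage sketch of the helper above: rely on the Airflow "redis_default"
# connection, or pass an explicit host/port (values illustrative):
#   r = get_redis_connection()                  # via Airflow connection
#   r = get_redis_connection("redis", 6379)     # manual host/port
#   r.ping()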
def get_free_port():
"""Find and return a free port."""
import socket
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('0.0.0.0', 0))
return s.getsockname()[1]
def is_port_free(p):
"""Check if a port is free to use."""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
try:
s.bind(('0.0.0.0', p))
return True
except OSError:
return False
def store_account_metadata(account_id, ip, port, proxy=None, health_port=None, container_id=None):
"""Store account metadata in Redis."""
redis_client = get_redis_connection()
try:
# Verify Redis connection
if not redis_client.ping():
raise ConnectionError("Failed to connect to Redis")
# Store main account metadata
mapping = {
"ip": ip,
"port": str(port),
"status": "running",
"start_time": str(time.time())
}
if proxy:
mapping["proxy"] = proxy
if health_port:
mapping["health_port"] = str(health_port)
if container_id:
mapping["container_id"] = container_id
# Use pipeline for atomic operations
with redis_client.pipeline() as pipe:
# Store main metadata
pipe.hset(f"ytdlp:{account_id}", mapping=mapping)
# Set expiration (1 week)
pipe.expire(f"ytdlp:{account_id}", 604800)
# Add to account list
pipe.sadd("ytdlp:accounts", account_id)
# Execute all commands
results = pipe.execute()
# Verify all commands succeeded
if not all(results):
raise RuntimeError(f"Failed to store metadata for {account_id}. Pipeline results: {results}")
# Verify the data was actually stored
stored_data = redis_client.hgetall(f"ytdlp:{account_id}")
if not stored_data:
raise RuntimeError(f"Failed to verify stored data for {account_id}")
logging.info(f"Successfully stored account metadata for {account_id} in Redis: {stored_data}")
return True
except Exception as e:
logging.error(f"Failed to store account metadata for {account_id}: {e}", exc_info=True)
# Attempt cleanup if storage failed
try:
redis_client = get_redis_connection() # Ensure client is available
redis_client.delete(f"ytdlp:{account_id}")
redis_client.srem("ytdlp:accounts", account_id)
except Exception as cleanup_error:
logging.error(f"Failed to cleanup failed storage for {account_id}: {cleanup_error}")
raise
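# A minimal sketch (hypothetical account_id) of inspecting what store_account_metadata
# writes to Redis; both keys are created in the pipeline above:
#   redis-cli HGETALL ytdlp:test_account
#   redis-cli SMEMBERS ytdlp:accounts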
# Removed get_account_metadata function as the service now handles Redis registration checks.
def prepare_and_deploy_service(**context):
"""Prepare deployment and deploy the Docker service."""
# Retrieve account_id, proxy, clients, and other parameters from DAG run configuration (conf)
# Set default values for account_id, proxy, and redis_enabled
account_id = context['dag_run'].conf.get('account_id') or context['params'].get('account_id', 'account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09')
proxy = context['dag_run'].conf.get('proxy') or context['params'].get('proxy', 'socks5://sslocal-rust-1084:1084')
clients = context['dag_run'].conf.get('clients') or context['params'].get('clients', 'ios,android,mweb')
redis_enabled = context['dag_run'].conf.get('redis_enabled', False) # Default to False
host_param = context['dag_run'].conf.get('host') # Host parameter from config
port_param = context['dag_run'].conf.get('port') # Port parameter from config
docker_network = context['dag_run'].conf.get('docker_network') or context['params'].get('docker_network', 'airflow_prod_proxynet')
host_external_ip_env = os.getenv('HOST_EXTERNAL_IP') # Explicit external IP from environment
if not account_id:
raise ValueError("Account ID is missing.")
# --- Port Determination ---
# Assign a free port if not provided, or validate the provided one
if not port_param:
port = get_free_port()
if not is_port_free(port):
raise ValueError(f"Assigned port {port} is already in use")
logging.info(f"No port provided, assigned free port: {port}")
else:
port = int(port_param)
if not is_port_free(port):
raise ValueError(f"Provided port {port} is already in use")
logging.info(f"Using provided port: {port}")
# Determine health port
health_port = port + 1
if not is_port_free(health_port):
raise ValueError(f"Health port {health_port} (derived from port {port}) is already in use")
logging.info(f"Using health port: {health_port}")
# --- Host Determination ---
# host_for_registration: IP/Host for client discovery (Redis/Logs)
# host_for_sensor: Hostname/IP for Airflow HttpSensor health check
host_for_registration = host_param # Start with the parameter value
if redis_enabled:
# If Redis is enabled, registration host should ideally be externally reachable
if not host_for_registration:
host_for_registration = host_external_ip_env # Use external IP from env var if available
if not host_for_registration:
# If no env var, try fetching external IP using requests
try:
logging.info("HOST_EXTERNAL_IP not set. Attempting to fetch external IP from api.ipify.org...")
response = requests.get('https://api.ipify.org', timeout=10) # 10 second timeout
response.raise_for_status() # Raise exception for bad status codes
host_for_registration = response.text.strip()
if not host_for_registration: # Check if response was empty
raise ValueError("Received empty response from api.ipify.org")
logging.info(f"Successfully fetched external IP: {host_for_registration}")
except requests.exceptions.RequestException as e:
logging.warning(f"Failed to fetch external IP: {e}. Falling back to Docker bridge IP.")
# Fallback to default Docker bridge IP if fetching fails
host_for_registration = "172.17.0.1"
logging.warning(f"Defaulting registration host to Docker bridge IP: {host_for_registration}. Ensure clients can reach this IP.")
except Exception as e:
logging.error(f"Unexpected error fetching external IP: {e}. Falling back to Docker bridge IP.")
host_for_registration = "172.17.0.1"
logging.warning(f"Defaulting registration host to Docker bridge IP: {host_for_registration}. Ensure clients can reach this IP.")
else:
logging.info(f"Redis enabled. Using HOST_EXTERNAL_IP environment variable for registration: {host_for_registration}")
else:
logging.info(f"Redis enabled. Using provided host parameter for registration: {host_for_registration}")
else: # Redis disabled
# If Redis is disabled, registration host defaults to 0.0.0.0 if not provided
if not host_for_registration:
host_for_registration = "0.0.0.0"
logging.warning(f"Redis disabled and no host param provided. Defaulting registration host to {host_for_registration}.")
else:
logging.info(f"Redis disabled. Using provided host parameter for registration: {host_for_registration}")
# host_for_sensor determination will happen *after* container creation, using container name.
logging.info(f"Preparing deployment for account {account_id}. Registration Host: {host_for_registration}, Port: {port}, Health Port: {health_port}")
# Generate unique work ID and context directory
work_id = str(uuid.uuid4())
context['task_instance'].xcom_push(key='work_id', value=work_id)
context_dir = os.path.join(os.getenv('AIRFLOW_HOME', '/tmp'), 'service-data', work_id, 'context-data')
os.makedirs(context_dir, exist_ok=True, mode=0o777)
os.chmod(context_dir, 0o777)
# Push context directory and account details to XCom
context['task_instance'].xcom_push(key='context_dir', value=context_dir)
context['task_instance'].xcom_push(key='account_id', value=account_id)
# Deploy the Docker service
# The 'host_for_registration' variable here represents the externally accessible IP for registration/XCom.
# The service inside the container will listen on 0.0.0.0.
logging.info(f"Deploying service for account {account_id}. Registration Host: {host_for_registration}, Port: {port}")
# Get Redis connection details ONLY if redis_enabled (for the container to register itself)
redis_host_for_container = ''
redis_port_for_container = ''
redis_password_for_container = ''
if redis_enabled:
try:
# Get connection details to pass to the container environment
redis_conn_details = get_redis_connection().connection_pool.connection_kwargs
redis_host_for_container = os.getenv('REDIS_HOST', redis_conn_details.get('host', 'redis'))
redis_port_for_container = str(os.getenv('REDIS_PORT', redis_conn_details.get('port', 6379)))
redis_password_for_container = os.getenv('REDIS_PASSWORD', redis_conn_details.get('password', ''))
logging.info(f"Redis enabled. Passing REDIS_HOST={redis_host_for_container}, REDIS_PORT={redis_port_for_container} to container.")
except Exception as e:
logging.error(f"Failed to get Redis connection details for container environment: {e}")
logging.warning("Proceeding without Redis details in container environment due to error.")
# Depending on container requirements, you might want to raise an error here instead
else:
logging.info("Redis disabled. Not passing REDIS_HOST/REDIS_PORT to container environment.")
# Get Docker connection details from Airflow
try:
secrets_backend = conf.get('secrets', 'backend', fallback='None')
logging.info(f"Attempting to get 'docker_hub' connection. Configured secrets backend: {secrets_backend}")
docker_conn = BaseHook.get_connection("docker_hub")
docker_username = docker_conn.login
docker_password = docker_conn.password
logging.info("Successfully retrieved 'docker_hub' connection.")
except Exception as e:
logging.error(f"Failed to retrieve 'docker_hub' connection: {e}")
# Log details about potential secrets backend issues
secrets_backend_kwargs = conf.get('secrets', 'backend_kwargs', fallback='{}')
logging.error(f"Secrets backend details: backend={secrets_backend}, kwargs={secrets_backend_kwargs}")
# Re-raise the exception to fail the task
raise
try:
# Initialize Docker client to connect to docker-socket-proxy
client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')
# Authenticate with Docker Hub
client.login(
username=docker_username,
password=docker_password,
registry=docker_conn.host # Typically "https://index.docker.io/v1/"
)
# Generate a unique container name
container_name = f"ytdlp_service_{account_id}_{uuid.uuid4().hex[:8]}"
# Pull the Docker image (if not already present)
client.images.pull('pangramia/ytdlp-ops-server:latest')
# Use the configured network name (from params or default)
network_name = docker_network # Use the retrieved parameter
logging.info(f"Attempting to run container on network: {network_name}")
# Determine if --probe flag should be added based on DAG param
exit_on_proxy_fail = context['dag_run'].conf.get('exit_on_proxy_fail', True) # Default to True if not set
command_args = [
'--script-dir', '/app/scripts',
'--context-dir', '/app/context-data', # Use the bind mount target inside container
'--port', str(port),
'--health-port', str(health_port),
'--clients', clients,
'--timeout', '120',
'--proxy', proxy if proxy else '',
'--server-identity', account_id, # Use account_id as server identity
]
if redis_enabled:
command_args.extend(['--redis-host', redis_host_for_container])
command_args.extend(['--redis-port', redis_port_for_container])
if exit_on_proxy_fail:
command_args.append('--probe')
logging.info("Adding --probe flag to container command as exit_on_proxy_fail=True")
else:
logging.info("Not adding --probe flag to container command as exit_on_proxy_fail=False")
# Run the Docker container with health port
container = client.containers.run(
image='pangramia/ytdlp-ops-server:latest',
command=command_args, # Use the constructed command list
environment={
'PYTHONUNBUFFERED': '1', # Ensure logs are not buffered
'SERVER_PORT': str(port), # Port the service listens on *inside* the container
'SERVER_HOST': '0.0.0.0', # Service should listen on all interfaces *inside* the container
'ACCOUNT_ID': account_id,
# Pass Redis details *if enabled* for the service to register itself
'REDIS_HOST': redis_host_for_container,
'REDIS_PORT': redis_port_for_container,
'REDIS_PASSWORD': redis_password_for_container,
# Pass PROXY_URL for health check access
'PROXY_URL': proxy if proxy else '',
},
ports={
f"{port}/tcp": port,
f"{health_port}/tcp": health_port
},
volumes={
context_dir: {'bind': '/app/context-data', 'mode': 'rw'}
},
network_mode=network_name, # Use the specified network variable
auto_remove=False, # Do not auto-remove the container
name=container_name, # Use a unique name
detach=True,
tty=True,
shm_size='256m',
# Updated healthcheck to test external connectivity via proxy
healthcheck={
# Use CMD-SHELL to allow conditional logic based on PROXY_URL env var
'test': [
'CMD-SHELL',
# Script checks if PROXY_URL is set, uses it with curl if yes, otherwise curls directly.
# -f: Fail silently (exit non-zero on error)
# --connect-timeout 10: Timeout for connection phase
# > /dev/null: Discard output, we only care about exit code
'if [ -n "$PROXY_URL" ]; then '
'curl -f --connect-timeout 10 -x "$PROXY_URL" https://ifconfig.co > /dev/null; '
'else '
'curl -f --connect-timeout 10 https://ifconfig.co > /dev/null; '
'fi'
],
'interval': 30 * 1000000000, # Check every 30 seconds (30 * 1e9 nanoseconds)
'timeout': 15 * 1000000000, # Timeout after 15 seconds (15 * 1e9 nanoseconds)
'retries': 5, # Retry 5 times on failure
'start_period': 15 * 1000000000 # Grace period of 15 seconds after start
},
# Add labels for better identification
labels={
'service': 'ytdlp',
'account_id': account_id
}
)
# Wait for container to be running (skip health check verification)
start_time = time.time()
while True:
container.reload()
if container.status == 'running':
break
if time.time() - start_time > 10: # 10 second timeout
raise TimeoutError("Container failed to start within 10 seconds")
time.sleep(1)
logging.info(f"Container started: {container.id} (health check verification skipped)")
# Push container details immediately after creation using simplified keys
context['task_instance'].xcom_push(key='container_id', value=container.id)
context['task_instance'].xcom_push(key='container_name', value=container_name)
logging.info(f"Pushed container_id={container.id} and container_name={container_name} to XCom.")
# --- Determine Host for Sensor ---
# Get the container's IP address on the specified network for the HttpSensor
try:
container.reload() # Refresh container attributes
network_settings = container.attrs.get('NetworkSettings', {}).get('Networks', {})
if network_name in network_settings:
host_for_sensor = network_settings[network_name].get('IPAddress')
if not host_for_sensor:
raise ValueError(f"Container {container.id} has no IPAddress on network '{network_name}'")
logging.info(f"Using container IP '{host_for_sensor}' on network '{network_name}' for HttpSensor.")
else:
# Fallback or error if container not on expected network
logging.error(f"Container {container.id} is not attached to the expected network '{network_name}'. Network settings: {network_settings}")
# Option 1: Fallback to container name (might fail as observed)
# host_for_sensor = container_name
# logging.warning(f"Falling back to container name '{host_for_sensor}' for sensor.")
# Option 2: Raise error
raise ValueError(f"Container {container.id} not found on network '{network_name}'. Cannot determine IP for sensor.")
except Exception as e:
logging.error(f"Failed to get container IP address: {e}", exc_info=True)
raise AirflowException(f"Failed to determine IP address for HttpSensor: {e}")
# Ensure we don't use 0.0.0.0 or empty string for the sensor
if not host_for_sensor or host_for_sensor == "0.0.0.0":
raise ValueError(f"Determined host_for_sensor is invalid ('{host_for_sensor}'). Check container network attachment and IP assignment.")
# --- Add extra logging before pushing ---
logging.info(f"FINAL CHECK before XCom push:")
logging.info(f" Account ID: {account_id}")
logging.info(f" Host for Sensor (IP Address): {host_for_sensor}")
logging.info(f" Host for Registration: {host_for_registration}")
logging.info(f" Service Port: {port}")
logging.info(f" Health Port: {health_port}")
logging.info(f" Pushing to XCom key: service_host with value: {host_for_sensor}")
# --- End extra logging ---
# Push distinct service connection details using simplified keys
context['task_instance'].xcom_push(key='service_host_registration', value=host_for_registration) # For client discovery (e.g., Redis)
context['task_instance'].xcom_push(key='service_host', value=host_for_sensor) # IP Address for HttpSensor
context['task_instance'].xcom_push(key='service_port', value=port) # Port is the same
context['task_instance'].xcom_push(key='service_health_port', value=health_port) # Health port is the same
logging.info(f"Pushed host_for_sensor (IP Address)={host_for_sensor} to XCom key 'service_host'")
logging.info(f"Pushed host_for_registration={host_for_registration} to XCom key 'service_host_registration'")
# Store account metadata in Redis only if redis_enabled is True
# This uses the 'host_for_registration' for client discovery
if redis_enabled:
store_account_metadata(account_id, host_for_registration, port, proxy, health_port, container.id)
# If we reach here, deployment is considered successful for now
logging.info("Deployment preparation successful.")
# Return values are implicitly pushed to XCom (but we pushed explicitly above)
return context_dir, host_for_registration, port
except Exception as e:
logging.error(f"Error during service deployment: {e}", exc_info=True)
# Attempt to cleanup the container if it was created before the error
try:
if 'container' in locals() and container and container.id:
logging.warning(f"Attempting to stop and remove container {container.id} due to deployment error.")
container.stop(timeout=5)
container.remove(force=True)
logging.info(f"Successfully stopped and removed container {container.id} after error.")
elif 'container_name' in locals() and container_name:
# Try finding by name if ID wasn't captured
containers = client.containers.list(filters={'name': container_name})
if containers:
logging.warning(f"Attempting to stop and remove container {containers[0].name} by name due to deployment error.")
containers[0].stop(timeout=5)
containers[0].remove(force=True)
logging.info(f"Successfully stopped and removed container {containers[0].name} after error.")
except Exception as cleanup_err:
logging.error(f"Failed during post-error container cleanup: {cleanup_err}")
raise # Re-raise the original exception to fail the task
# Removed the old monitor_health PythonOperator
# stop_service and cleanup_service are now defined directly in the DAG below.
def check_service_health(ti=None, **context):
"""
Periodically checks the service's /health endpoint using requests.
Acts as a long-running sentinel task. Fails if the health check fails
repeatedly or times out.
"""
# Get parameters from XCom
host_reg = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_host_registration')
host_svc = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_host')
health_port = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_health_port')
# Determine the host to use (prioritize registration host)
host = host_reg if host_reg and host_reg != '0.0.0.0' else host_svc
if not host or not health_port:
raise AirflowException("Could not retrieve host or health_port from XCom for health check.")
health_url = f"http://{host}:{health_port}/health"
logging.info(f"Starting health check for: {health_url}")
# Get configuration for polling
# Use task's execution_timeout if available, otherwise default to 1 year
task_timeout = ti.task.execution_timeout or timedelta(days=365)
poke_interval = 60 # Check every 60 seconds (adjust as needed)
start_time = time.monotonic()
timeout_seconds = task_timeout.total_seconds()
consecutive_error_start_time = None # Track start time of consecutive connection errors
error_retry_window = 10 # Seconds to retry connection errors before failing
while True:
current_time = time.monotonic()
if current_time - start_time > timeout_seconds:
raise AirflowException(f"Health check timed out after {timeout_seconds} seconds for {health_url}")
try:
# Use a reasonable timeout for the individual request
response = requests.get(health_url, timeout=15) # 15 second request timeout
response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
# Check response content if needed (optional)
# Example: Check for specific JSON content
# try:
# data = response.json()
# if data.get("status") == "healthy":
# logging.info(f"Health check successful: Status {response.status_code}")
# else:
# logging.warning(f"Health check OK (Status {response.status_code}), but content unexpected: {data}")
# except requests.exceptions.JSONDecodeError:
# logging.warning(f"Health check OK (Status {response.status_code}), but response is not valid JSON.")
# If we got a 2xx status, log success and reset error timer if needed
if consecutive_error_start_time is not None:
logging.info(f"Connection to {health_url} recovered.")
consecutive_error_start_time = None
logging.info(f"Health check successful: Status {response.status_code} for {health_url}")
except requests.exceptions.Timeout:
current_monotonic_time = time.monotonic()
if consecutive_error_start_time is None:
consecutive_error_start_time = current_monotonic_time
logging.warning(f"Health check request timed out for {health_url}. Starting {error_retry_window}s retry window...")
else:
elapsed_error_time = current_monotonic_time - consecutive_error_start_time
if elapsed_error_time > error_retry_window:
error_msg = f"Health check failed for {health_url}: Timeout persisted for over {error_retry_window} seconds."
logging.error(error_msg)
raise AirflowException(error_msg)
else:
logging.warning(f"Health check request timed out for {health_url}. Retrying within {error_retry_window}s window ({elapsed_error_time:.1f}s elapsed)...")
except requests.exceptions.ConnectionError as e:
# Check if the error is specifically "Connection refused" - fail immediately
if "[Errno 111] Connection refused" in str(e):
logging.error(f"Health check failed for {health_url}: Connection refused. Failing task immediately.")
raise AirflowException(f"Health check failed for {health_url}: Connection refused")
else:
# Handle other connection errors with the retry window
current_monotonic_time = time.monotonic()
if consecutive_error_start_time is None:
consecutive_error_start_time = current_monotonic_time
logging.warning(f"Health check connection error for {health_url}: {e}. Starting {error_retry_window}s retry window...")
else:
elapsed_error_time = current_monotonic_time - consecutive_error_start_time
if elapsed_error_time > error_retry_window:
error_msg = f"Health check failed for {health_url}: Connection error persisted for over {error_retry_window} seconds. Last error: {e}"
logging.error(error_msg)
raise AirflowException(error_msg)
else:
logging.warning(f"Health check connection error for {health_url}: {e}. Retrying within {error_retry_window}s window ({elapsed_error_time:.1f}s elapsed)...")
except requests.exceptions.HTTPError as e:
# This catches 4xx/5xx errors - fail immediately
logging.error(f"Health check failed for {health_url}: Status {e.response.status_code}. Failing task.")
# Fail the task immediately on HTTP error
raise AirflowException(f"Health check failed for {health_url}: Status {e.response.status_code}")
except requests.exceptions.RequestException as e:
logging.error(f"Health check failed for {health_url} with unexpected error: {e}. Failing task.")
# Fail the task immediately on other request errors
raise AirflowException(f"Health check failed for {health_url}: {e}")
except Exception as e:
# Catch any other unexpected errors during the check
logging.error(f"Unexpected error during health check for {health_url}: {e}", exc_info=True)
raise AirflowException(f"Unexpected error during health check: {e}")
# Wait for the poke interval before the next check
time.sleep(poke_interval)
def _wait_forever():
"""Sleeps indefinitely (or until task timeout) to simulate a running service."""
logging.info("Sentinel task started. Sleeping in a loop...")
# Sleep in a loop with a reasonable interval to avoid OverflowError
# The task will keep running until it times out based on execution_timeout
# or is manually stopped/failed.
while True:
try:
# Sleep for a long interval (e.g., 1 day)
# You can adjust this interval if needed.
time.sleep(86400) # Sleep for 24 hours
except KeyboardInterrupt:
logging.info("Sentinel task interrupted. Exiting.")
break
except Exception as e:
# Log other potential errors during sleep, though unlikely
logging.error(f"Error during sentinel sleep loop: {e}")
# Optionally break or continue based on error handling strategy
break # Exit loop on unexpected error
def stop_service(**context):
"""Stop the running Docker container with verification."""
# Retrieve account_id from params or kwargs
account_id = context.get('params', {}).get('account_id') or context.get('account_id')
if not account_id:
raise ValueError("Account ID is missing.")
# Initialize Docker client to connect to docker-socket-proxy
client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')
try:
# For testing, try to get container ID from environment if XCom is not available
container_id = None
if 'ti' in context:
# Use simplified XCom key
container_id = context['ti'].xcom_pull(task_ids='prepare_and_deploy', key='container_id')
if not container_id:
# If not found in XCom, try to find container by account_id pattern (keep this fallback)
containers = client.containers.list(filters={"name": f"ytdlp_service_{account_id}"})
if containers:
container = containers[0]
container_id = container.id
logging.info(f"Found container by name pattern: {container.name} (ID: {container_id})")
else:
logging.warning(f"No container found for account {account_id} - nothing to stop")
return
if container_id:
# If found in XCom, stop by container ID
container = client.containers.get(container_id)
# Verify container is running before stopping
if container.status != 'running':
logging.warning(f"Container {container_id} is not running (status: {container.status})")
return
logging.info(f"Stopping container {container_id}...")
container.stop(timeout=10) # 10 second timeout
# Verify container is stopped
container.reload()
if container.status == 'exited':
logging.info(f"Successfully stopped container {container_id}")
else:
logging.error(f"Container {container_id} failed to stop (status: {container.status})")
raise RuntimeError(f"Container {container_id} failed to stop")
# Clear Redis entries only if redis_enabled is True
# Retrieve redis_enabled status from DAG run conf or params
redis_enabled = context['dag_run'].conf.get('redis_enabled', False) or context['params'].get('redis_enabled', False)
if redis_enabled:
redis_client = get_redis_connection()
try:
# Verify Redis connection
if not redis_client.ping():
raise ConnectionError("Failed to connect to Redis")
# Remove main metadata
redis_client.delete(f"ytdlp:{account_id}")
# Remove from accounts set
redis_client.srem("ytdlp:accounts", account_id)
logging.info(f"Successfully cleared Redis entries for account: {account_id}")
except Exception as e:
logging.error(f"Failed to clear Redis entries for account {account_id}: {e}")
# Do not raise here, allow container stop to be considered successful
# raise # Optional: re-raise if Redis cleanup failure should fail the task
return
logging.warning(f"No container found for account {account_id} - nothing to stop")
except docker.errors.NotFound as e:
logging.warning(f"Container for account {account_id} not found: {e}")
except Exception as e:
logging.error(f"Failed to stop container: {e}")
raise
def cleanup_service(**context):
"""Cleanup service resources including Redis entries and XCom data."""
# Note: This function is now called within the manual_stop_cleanup TaskGroup
try:
# Retrieve account_id from params first, then from XCom
account_id = context['params'].get('account_id')
if not account_id:
# Try to get it from XCom
account_id = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='account_id')
if not account_id:
logging.warning("Account ID not found in params or XCom - skipping resource cleanup")
return
# Redis cleanup (if redis_enabled=True) is handled in the 'stop_service' task.
logging.info(f"Redis cleanup for account {account_id} is handled by the 'stop_service' task if enabled.")
# Cleanup XCom data (using simplified keys where applicable)
# Note: XCom cleanup is generally not strictly necessary but can be good practice.
# Airflow manages XCom expiry. This code doesn't actually *delete* XComs.
# To truly delete, you'd use the Airflow API or DB directly.
# We'll leave the pull calls here as they don't harm anything.
ti = context['task_instance']
ti.xcom_pull(key='container_id', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='container_name', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='service_host_registration', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='service_host', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='service_port', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='service_health_port', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='work_id', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='context_dir', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='account_id', task_ids='prepare_and_deploy', include_prior_dates=True) # Keep account_id pull
logging.info(f"Pulled XCom data for potential cleanup logging for account: {account_id}")
# Initialize Docker client
client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')
container_found_and_removed = False
# Attempt 1: Get container ID from XCom using simplified key
container_id_xcom = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='container_id')
if container_id_xcom:
logging.info(f"Attempting to remove container using XCom ID: {container_id_xcom}")
try:
container = client.containers.get(container_id_xcom)
logging.info(f"Found container {container.id} (Name: {container.name}). Removing...")
container.remove(force=True)
logging.info(f"Successfully removed container {container.id}")
container_found_and_removed = True
except docker.errors.NotFound:
logging.warning(f"Container with XCom ID {container_id_xcom} not found. Trying other methods.")
except Exception as e:
logging.error(f"Error removing container {container_id_xcom}: {e}")
# Attempt 2: Find container by labels if not found/removed via XCom ID
if not container_found_and_removed:
logging.info(f"Attempting to find and remove container by labels: service=ytdlp, account_id={account_id}")
try:
containers = client.containers.list(
filters={'label': ['service=ytdlp', f'account_id={account_id}']},
all=True # Include stopped containers
)
if containers:
for container in containers:
logging.info(f"Found container {container.id} (Name: {container.name}) by labels. Removing...")
try:
container.remove(force=True)
logging.info(f"Successfully removed container {container.id}")
container_found_and_removed = True # Mark as found even if only one is removed
except Exception as e:
logging.error(f"Error removing container {container.id} found by labels: {e}")
else:
logging.info("No containers found matching labels.")
except Exception as e:
logging.error(f"Error searching for containers by labels: {e}")
# Attempt 3: Find container by name pattern if still not found/removed
if not container_found_and_removed:
container_name_pattern = f"ytdlp_service_{account_id}_*"
logging.info(f"Attempting to find and remove container by name pattern: {container_name_pattern}")
try:
containers = client.containers.list(filters={'name': container_name_pattern}, all=True)
if containers:
for container in containers:
logging.info(f"Found container {container.id} (Name: {container.name}) by name pattern. Removing...")
try:
container.remove(force=True)
logging.info(f"Successfully removed container {container.id}")
container_found_and_removed = True
except Exception as e:
logging.error(f"Error removing container {container.id} found by name: {e}")
else:
logging.info("No containers found matching name pattern.")
except Exception as e:
logging.error(f"Error searching for containers by name: {e}")
if not container_found_and_removed:
logging.warning(f"Could not find or remove any container for account {account_id} using ID, labels, or name.")
# Get context directory from XCom and remove it
context_dir = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='context_dir')
if context_dir and os.path.exists(context_dir):
shutil.rmtree(context_dir)
logging.info(f"Cleaned up working directory: {context_dir}")
except Exception as e:
logging.error(f"Error during cleanup: {e}")
raise
# Define the DAG
with DAG(
'ytdlp_service',
default_args=default_args,
description='Deploy YTDLP token service for ios, android, mweb',
schedule_interval=None,
start_date=days_ago(1), # Use dynamic start date for manually triggered DAG
catchup=False,
tags=['youtube', 'tokens', 'service', 'docker'],
# executor_config moved to default_args
is_paused_upon_creation=False,
params={
'account_id': Param(
'account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09',
type="string",
description="Required: The account ID for which the service is being deployed."
),
'proxy': Param(
'socks5://sslocal-rust-1084:1084',
type=["null", "string"],
description="Optional: The SOCKS5 proxy URL to use for the service (e.g., socks5://host:port)."
),
'clients': Param(
'ios,android,mweb',
type="string",
description="Comma-separated list of client types (e.g., ios,android,mweb)."
),
'redis_enabled': Param(
False,
type="boolean",
description="Use Redis for service discovery? If False, host/port must be provided or will be auto-assigned."
),
'host': Param(
None,
type=["null", "string"],
description="Optional: Host IP for the service. If redis_enabled=False and host is not provided, defaults to '0.0.0.0'. If redis_enabled=True and host is not provided, uses HOST_EXTERNAL_IP or defaults to '0.0.0.0'."
),
'port': Param(
None,
type=["null", "integer"],
description="Optional: Port for the service. If None, a free port will be assigned automatically. If redis_enabled=False and a port is provided, it will be used (after checking availability)."
),
# redis_host and redis_port parameters are removed.
# If redis_enabled=True, the DAG will use the 'redis_default' Airflow connection.
'docker_network': Param(
'airflow_prod_proxynet',
type="string",
description="Optional: The Docker network to attach the container to. Defaults to 'airflow_prod_proxynet'."
),
'exit_on_proxy_fail': Param(
True,
type="boolean",
description="Exit the service container immediately if the initial proxy test fails?"
),
}
) as dag:
# Task to prepare and deploy the service
prepare_and_deploy = PythonOperator(
task_id='prepare_and_deploy',
python_callable=prepare_and_deploy_service,
provide_context=True,
trigger_rule='all_success' # Keep default trigger rule for prepare_and_deploy
)
# Combined Health Check and Sentinel Task using PythonOperator
# This task runs for a long time, checking health periodically using the 'requests' library.
# If the health check fails repeatedly or times out, the task fails, triggering 'stop_service'.
monitor_service_health = PythonOperator(
task_id='monitor_service_health',
python_callable=check_service_health,
provide_context=True,
# Set execution timeout for the task itself (acts as the overall timeout)
execution_timeout=timedelta(days=365), # Long timeout (e.g., 1 year)
# op_kwargs can pass static config, but host/port come from XCom inside the function
# poke_interval and request timeout are handled within check_service_health
)
monitor_service_health.doc_md = """
### Monitor Service Health Task (PythonOperator)
Uses a Python function to periodically check the service's `/health` endpoint using the `requests` library.
Acts as both a health check and a sentinel for the running service.
- **Pulls from XCom:** Reads `service_host_registration`, `service_host`, and `service_health_port` from the `prepare_and_deploy` task to construct the target URL.
- **Polling:** Checks the `/health` endpoint every 60 seconds.
- **Timeout:** Uses the task's `execution_timeout` (set to 1 year) as the overall maximum duration. Individual requests have a 15-second timeout.
- **Failure:** If a health check request returns a 4xx/5xx status code or hits any other request error, the task fails immediately. If the `execution_timeout` is reached first, the task times out and fails instead.
"""
# Task to stop the service (runs if monitor_service_health fails)
stop = PythonOperator(
task_id='stop_service',
python_callable=stop_service,
provide_context=True,
trigger_rule=TriggerRule.ONE_FAILED # Run only if monitor_service_health fails
)
stop.doc_md = """
### Stop Service Task
Stops the Docker container associated with the service.
- **Trigger Rule:** `one_failed` - This task only runs if the upstream `monitor_service_health` task fails.
- Pulls container ID/name from XCom or finds it using labels/name patterns.
- Clears Redis entries if `redis_enabled=True`.
"""
# Marker task to indicate that the deployment failed
prepare_failed_marker = EmptyOperator(
task_id='prepare_failed_marker',
trigger_rule=TriggerRule.ONE_FAILED # Run only if 'prepare_and_deploy' fails
)
# Task to cleanup resources (runs after stop sequence OR if prepare fails)
cleanup = PythonOperator(
task_id='cleanup_service',
python_callable=cleanup_service,
provide_context=True,
trigger_rule=TriggerRule.ALL_DONE # Run after upstream (stop or prepare_failed_marker) is done
)
cleanup.doc_md = """
### Cleanup Service Task
Removes the Docker container and cleans up related resources.
- **Trigger Rule:** `all_done` - Runs once its upstream task (`stop_service` or `prepare_failed_marker`) has finished, regardless of outcome.
- Removes the container using ID from XCom, labels, or name patterns.
- Cleans up XCom variables.
- Removes the context directory.
"""
# Define task dependencies
# Success Path: prepare -> monitor (runs indefinitely)
# Monitor Failure Path: monitor (fails) -> stop -> cleanup
# Prepare Failure Path: prepare (fails) -> prepare_failed_marker -> cleanup
prepare_and_deploy >> monitor_service_health
prepare_and_deploy >> prepare_failed_marker # Trigger marker if prepare fails
monitor_service_health >> stop # Trigger stop if monitor fails
# Cleanup is triggered after stop finishes OR after prepare_failed_marker finishes
stop >> cleanup
prepare_failed_marker >> cleanup

BIN
airflow/dags/.DS_Store vendored Normal file

Binary file not shown.

23
airflow/dags/get_ip.py Normal file
View File

@ -0,0 +1,23 @@
import socket
import logging
logger = logging.getLogger(__name__)
def get_ip_address():
"""
Get the primary IP address of the host.
This is used by Airflow workers to advertise their IP for log serving,
ensuring the webserver can reach them in a multi-host environment.
"""
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
# This doesn't even have to be reachable
s.connect(('10.255.255.255', 1))
ip_address = s.getsockname()[0]
logger.info(f"Determined host IP address as: {ip_address}")
except Exception as e:
logger.warning(f"Could not determine IP address, falling back to 127.0.0.1. Error: {e}")
ip_address = '127.0.0.1'
finally:
s.close()
return ip_address
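A callable like this is typically referenced from Airflow configuration so workers advertise a reachable address for log serving; the exact wiring below is an assumption (hypothetical environment variable value, module importable as `get_ip` on the worker):

```python
# e.g. AIRFLOW__CORE__HOSTNAME_CALLABLE=get_ip.get_ip_address  (assumed wiring, not shown in this commit)
from get_ip import get_ip_address

print(get_ip_address())  # prints the detected IP, or 127.0.0.1 on failure
```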

View File

@ -0,0 +1,86 @@
import logging
import time
import requests
from datetime import datetime
from airflow.decorators import task
from airflow.models.dag import DAG
from airflow.models.param import Param
from airflow.models.variable import Variable
logger = logging.getLogger(__name__)
# Get the master host IP from an Airflow variable, which is set via the .env file.
# This allows the default health check target to be dynamic based on cluster.yml.
DEFAULT_MASTER_IP = Variable.get("MASTER_HOST_IP", default_var="127.0.0.1")
with DAG(
dag_id='proxy_health_check',
start_date=datetime(2023, 1, 1),
schedule=None,
catchup=False,
tags=['monitoring', 'proxy'],
doc_md="""
### Proxy Health Check DAG
This DAG runs a continuous loop to check a target URL through a SOCKS5 proxy.
It is designed for monitoring proxy connectivity and performance. Once triggered, it will run forever
until the DAG run is manually stopped.
**Parameters:**
- `target_url`: The URL to check. Defaults to the internal nginx service.
- `socks5_host`: The SOCKS5 proxy host. For Docker, `host.docker.internal` often works to target the host machine.
- `socks5_port`: The SOCKS5 proxy port.
- `check_interval_seconds`: How often to run the check.
- `latency_threshold_seconds`: A warning will be logged if the request takes longer than this.
- `timeout_seconds`: The timeout for the web request.
""",
params={
'target_url': Param(f'http://{DEFAULT_MASTER_IP}:8888', type='string', description="The URL to check. Defaults to the master node's nginx healthcheck service."),
'socks5_host': Param('sslocal-rust-1087', type='string', description="SOCKS5 proxy host. Use 'host.docker.internal' for Docker host."),
'socks5_port': Param(1087, type='integer', description="SOCKS5 proxy port."),
'check_interval_seconds': Param(25, type='integer', description="Seconds to wait between checks."),
'latency_threshold_seconds': Param(2, type='integer', description="Log a warning if latency exceeds this threshold."),
'timeout_seconds': Param(10, type='integer', description="Request timeout in seconds."),
},
) as dag:
@task
def run_proxy_check_loop(**context):
"""
Continuously checks a URL through a SOCKS5 proxy and logs if latency is high.
This task will run indefinitely until the DAG run is manually stopped or fails.
"""
params = context['params']
target_url = params['target_url']
proxy_host = params['socks5_host']
proxy_port = params['socks5_port']
interval = params['check_interval_seconds']
threshold = params['latency_threshold_seconds']
timeout = params['timeout_seconds']
proxy_url = f"socks5h://{proxy_host}:{proxy_port}"
proxies = {
'http': proxy_url,
'https': proxy_url,
}
logger.info(f"Starting proxy health check loop. Target: {target_url}, Proxy: {proxy_url}, Interval: {interval}s, Threshold: {threshold}s")
while True:
start_time = time.time()
try:
response = requests.get(target_url, proxies=proxies, timeout=timeout)
response.raise_for_status()
latency = time.time() - start_time
if latency > threshold:
logger.warning(f"High latency detected! Latency: {latency:.2f}s, Threshold: {threshold}s, Target: {target_url}")
except requests.exceptions.RequestException as e:
latency = time.time() - start_time
logger.error(f"Proxy check failed for {target_url} via {proxy_url}. Latency: {latency:.2f}s. Error: {e}")
time.sleep(interval)
run_proxy_check_loop()
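Note that `socks5h://` proxies with `requests` need the PySocks extra (`pip install requests[socks]`). To start a run with custom settings, the DAG can be triggered with a `conf` payload; a sketch using the Airflow stable REST API (endpoint, credentials, and the target values are placeholders):

```python
import requests

AIRFLOW_API = "http://localhost:8080/api/v1"   # placeholder
AUTH = ("airflow", "airflow")                  # placeholder credentials

payload = {
    "conf": {
        "target_url": "http://10.0.0.5:8888",  # example target
        "socks5_host": "sslocal-rust-1087",
        "socks5_port": 1087,
        "check_interval_seconds": 30,
        "latency_threshold_seconds": 2,
        "timeout_seconds": 10,
    }
}
resp = requests.post(f"{AIRFLOW_API}/dags/proxy_health_check/dagRuns",
                     json=payload, auth=AUTH, timeout=10)
resp.raise_for_status()
print(resp.json()["dag_run_id"])
```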

View File

@ -207,9 +207,9 @@ def manage_system_callable(**context):
# --- Validate Action/Entity Combination and Parameters ---
valid_actions = {
"proxy": ["list_with_status", "ban", "unban", "unban_all", "delete_from_redis"],
"proxy": ["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis"],
"account": ["list_with_status", "ban", "unban", "unban_all", "delete_from_redis"],
"accounts_and_proxies": ["list_with_status", "ban", "unban", "unban_all", "delete_from_redis"],
"accounts_and_proxies": ["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis"],
"airflow_meta": ["clear_dag_runs"],
}
@ -221,7 +221,7 @@ def manage_system_callable(**context):
# Validate required parameters for the chosen action
if entity == "proxy":
if action in ["ban", "unban", "unban_all"] and not server_identity:
if action in ["ban", "unban"] and not server_identity:
raise ValueError(f"A 'server_identity' is required for proxy action '{action}'.")
if action in ["ban", "unban"] and not proxy_url:
raise ValueError(f"A 'proxy_url' is required for proxy action '{action}'.")
@ -233,8 +233,6 @@ def manage_system_callable(**context):
# --- Handle Airflow Meta actions separately as they don't use Thrift --- # --- Handle Airflow Meta actions separately as they don't use Thrift ---
if entity == "airflow_meta": if entity == "airflow_meta":
dag_id = params.get("dag_id_to_manage") dag_id = params.get("dag_id_to_manage")
if not dag_id:
raise AirflowException("An 'dag_id_to_manage' is required for airflow_meta actions.")
if action == "clear_dag_runs": if action == "clear_dag_runs":
clear_scope = params.get("clear_scope") clear_scope = params.get("clear_scope")
@ -277,27 +275,39 @@ def manage_system_callable(**context):
# --- Delete Proxy --- # --- Delete Proxy ---
proxy_url = params.get("proxy_url") proxy_url = params.get("proxy_url")
server_identity = params.get("server_identity") server_identity = params.get("server_identity")
if not proxy_url:
raise ValueError("A 'proxy_url' is required for proxy action 'delete_from_redis'.")
if not server_identity:
raise ValueError("A 'server_identity' is required for proxy action 'delete_from_redis'.")
proxy_state_key = f"proxies:{server_identity}" if proxy_url and server_identity:
proxy_failure_key = f"proxy_failures:{proxy_url}" proxy_state_key = f"proxy_status:{server_identity}"
logger.warning(f"Deleting proxy '{proxy_url}' state from hash '{proxy_state_key}' and failure key '{proxy_failure_key}' from Redis.") logger.warning(f"Deleting proxy '{proxy_url}' state from hash '{proxy_state_key}' from Redis.")
with redis_client.pipeline() as pipe: with redis_client.pipeline() as pipe:
pipe.hdel(proxy_state_key, proxy_url) pipe.hdel(proxy_state_key, proxy_url)
pipe.delete(proxy_failure_key) results = pipe.execute()
results = pipe.execute()
hdel_result = results[0]
hdel_result = results[0] print(f"\nSuccessfully removed proxy '{proxy_url}' from state hash (result: {hdel_result}).")
del_result = results[1] else:
print(f"\nSuccessfully removed proxy '{proxy_url}' from state hash (result: {hdel_result}) and deleted failure key (result: {del_result}).") logger.warning("No 'proxy_url' or 'server_identity' provided. Deleting ALL proxy state keys from Redis.")
patterns = ["proxy_status:*"]
keys_to_delete = []
for pattern in patterns:
found_keys = [key for key in redis_client.scan_iter(pattern)]
if found_keys:
logger.info(f"Found {len(found_keys)} keys for pattern '{pattern}'.")
keys_to_delete.extend(found_keys)
else:
logger.info(f"No keys found for pattern '{pattern}'.")
if not keys_to_delete:
print("\nNo proxy keys found to delete.\n")
else:
print(f"\nWARNING: Found {len(keys_to_delete)} proxy-related keys to remove from Redis.")
deleted_count = redis_client.delete(*keys_to_delete)
print(f"\nSuccessfully removed {deleted_count} proxy-related keys from Redis.\n")
# --- Delete Account --- # --- Delete Account ---
account_prefix = params.get("account_id") # Repurpose account_id param as an optional prefix account_prefix = params.get("account_id")
pattern = f"account_status:{account_prefix}*" if account_prefix else "account_status:*" pattern = f"account_status:{account_prefix}*" if account_prefix else "account_status:*"
logger.warning(f"Searching for account status keys in Redis with pattern: '{pattern}'") logger.warning(f"Searching for account status keys in Redis with pattern: '{pattern}'")
@ -340,24 +350,37 @@ def manage_system_callable(**context):
elif entity == "proxy": elif entity == "proxy":
proxy_url = params.get("proxy_url") proxy_url = params.get("proxy_url")
server_identity = params.get("server_identity") server_identity = params.get("server_identity")
if not proxy_url:
raise ValueError("A 'proxy_url' is required for proxy action 'delete_from_redis'.")
if not server_identity:
raise ValueError("A 'server_identity' is required for proxy action 'delete_from_redis'.")
proxy_state_key = f"proxies:{server_identity}" if proxy_url and server_identity:
proxy_failure_key = f"proxy_failures:{proxy_url}" proxy_state_key = f"proxy_status:{server_identity}"
logger.warning(f"Deleting proxy '{proxy_url}' state from hash '{proxy_state_key}' and failure key '{proxy_failure_key}' from Redis.") logger.warning(f"Deleting proxy '{proxy_url}' state from hash '{proxy_state_key}' from Redis.")
with redis_client.pipeline() as pipe: with redis_client.pipeline() as pipe:
pipe.hdel(proxy_state_key, proxy_url) pipe.hdel(proxy_state_key, proxy_url)
pipe.delete(proxy_failure_key) results = pipe.execute()
results = pipe.execute()
hdel_result = results[0]
hdel_result = results[0] print(f"\nSuccessfully removed proxy '{proxy_url}' from state hash (result: {hdel_result}).\n")
del_result = results[1] else:
print(f"\nSuccessfully removed proxy '{proxy_url}' from state hash (result: {hdel_result}) and deleted failure key (result: {del_result}).\n") logger.warning("No 'proxy_url' or 'server_identity' provided. Deleting ALL proxy state keys from Redis.")
patterns = ["proxy_status:*"]
keys_to_delete = []
for pattern in patterns:
found_keys = [key for key in redis_client.scan_iter(pattern)]
if found_keys:
logger.info(f"Found {len(found_keys)} keys for pattern '{pattern}'.")
keys_to_delete.extend(found_keys)
else:
logger.info(f"No keys found for pattern '{pattern}'.")
if not keys_to_delete:
print("\nNo proxy keys found to delete.\n")
return
print(f"\nWARNING: Found {len(keys_to_delete)} proxy-related keys to remove from Redis.")
deleted_count = redis_client.delete(*keys_to_delete)
print(f"\nSuccessfully removed {deleted_count} proxy-related keys from Redis.\n")
return # End execution for this action return # End execution for this action
@ -378,10 +401,66 @@ def manage_system_callable(**context):
logger.info(f"Unbanning proxy '{proxy_url}' for server '{server_identity}'...") logger.info(f"Unbanning proxy '{proxy_url}' for server '{server_identity}'...")
client.unbanProxy(proxy_url, server_identity) client.unbanProxy(proxy_url, server_identity)
print(f"Successfully sent request to unban proxy '{proxy_url}'.") print(f"Successfully sent request to unban proxy '{proxy_url}'.")
elif action == "ban_all":
if server_identity:
logger.info(f"Banning all proxies for server '{server_identity}'...")
client.banAllProxies(server_identity)
print(f"Successfully sent request to ban all proxies for '{server_identity}'.")
else:
logger.info("No server_identity provided. Banning all proxies for ALL servers...")
all_statuses = client.getProxyStatus(None)
if not all_statuses:
print("\nNo proxy statuses found for any server. Nothing to ban.\n")
return
all_server_identities = sorted(list(set(s.serverIdentity for s in all_statuses)))
logger.info(f"Found {len(all_server_identities)} server identities: {all_server_identities}")
print(f"Found {len(all_server_identities)} server identities. Sending ban request for each...")
success_count = 0
fail_count = 0
for identity in all_server_identities:
try:
client.banAllProxies(identity)
logger.info(f" - Sent ban_all for '{identity}'.")
success_count += 1
except Exception as e:
logger.error(f" - Failed to ban all proxies for '{identity}': {e}")
fail_count += 1
print(f"\nSuccessfully sent ban_all requests for {success_count} server identities.")
if fail_count > 0:
print(f"Failed to send ban_all requests for {fail_count} server identities. See logs for details.")
elif action == "unban_all": elif action == "unban_all":
logger.info(f"Unbanning all proxy statuses for server '{server_identity}'...") if server_identity:
client.resetAllProxyStatuses(server_identity) logger.info(f"Unbanning all proxy statuses for server '{server_identity}'...")
print(f"Successfully sent request to unban all proxy statuses for '{server_identity}'.") client.resetAllProxyStatuses(server_identity)
print(f"Successfully sent request to unban all proxy statuses for '{server_identity}'.")
else:
logger.info("No server_identity provided. Unbanning all proxies for ALL servers...")
all_statuses = client.getProxyStatus(None)
if not all_statuses:
print("\nNo proxy statuses found for any server. Nothing to unban.\n")
return
all_server_identities = sorted(list(set(s.serverIdentity for s in all_statuses)))
logger.info(f"Found {len(all_server_identities)} server identities: {all_server_identities}")
print(f"Found {len(all_server_identities)} server identities. Sending unban request for each...")
success_count = 0
fail_count = 0
for identity in all_server_identities:
try:
client.resetAllProxyStatuses(identity)
logger.info(f" - Sent unban_all for '{identity}'.")
success_count += 1
except Exception as e:
logger.error(f" - Failed to unban all proxies for '{identity}': {e}")
fail_count += 1
print(f"\nSuccessfully sent unban_all requests for {success_count} server identities.")
if fail_count > 0:
print(f"Failed to send unban_all requests for {fail_count} server identities. See logs for details.")
elif entity == "account": elif entity == "account":
if action == "list_with_status": if action == "list_with_status":
@ -449,10 +528,64 @@ def manage_system_callable(**context):
logger.info(f"Unbanning proxy '{proxy_url}' for server '{server_identity}'...") logger.info(f"Unbanning proxy '{proxy_url}' for server '{server_identity}'...")
client.unbanProxy(proxy_url, server_identity) client.unbanProxy(proxy_url, server_identity)
print(f"Successfully sent request to unban proxy '{proxy_url}'.") print(f"Successfully sent request to unban proxy '{proxy_url}'.")
elif action == "ban_all":
if server_identity:
logger.info(f"Banning all proxies for server '{server_identity}'...")
client.banAllProxies(server_identity)
print(f"Successfully sent request to ban all proxies for '{server_identity}'.")
else:
logger.info("No server_identity provided. Banning all proxies for ALL servers...")
all_statuses = client.getProxyStatus(None)
if not all_statuses:
print("\nNo proxy statuses found for any server. Nothing to ban.\n")
else:
all_server_identities = sorted(list(set(s.serverIdentity for s in all_statuses)))
logger.info(f"Found {len(all_server_identities)} server identities: {all_server_identities}")
print(f"Found {len(all_server_identities)} server identities. Sending ban request for each...")
success_count = 0
fail_count = 0
for identity in all_server_identities:
try:
client.banAllProxies(identity)
logger.info(f" - Sent ban_all for '{identity}'.")
success_count += 1
except Exception as e:
logger.error(f" - Failed to ban all proxies for '{identity}': {e}")
fail_count += 1
print(f"\nSuccessfully sent ban_all requests for {success_count} server identities.")
if fail_count > 0:
print(f"Failed to send ban_all requests for {fail_count} server identities. See logs for details.")
elif action == "unban_all": elif action == "unban_all":
logger.info(f"Unbanning all proxy statuses for server '{server_identity}'...") if server_identity:
client.resetAllProxyStatuses(server_identity) logger.info(f"Unbanning all proxy statuses for server '{server_identity}'...")
print(f"Successfully sent request to unban all proxy statuses for '{server_identity}'.") client.resetAllProxyStatuses(server_identity)
print(f"Successfully sent request to unban all proxy statuses for '{server_identity}'.")
else:
logger.info("No server_identity provided. Unbanning all proxies for ALL servers...")
all_statuses = client.getProxyStatus(None)
if not all_statuses:
print("\nNo proxy statuses found for any server. Nothing to unban.\n")
else:
all_server_identities = sorted(list(set(s.serverIdentity for s in all_statuses)))
logger.info(f"Found {len(all_server_identities)} server identities: {all_server_identities}")
print(f"Found {len(all_server_identities)} server identities. Sending unban request for each...")
success_count = 0
fail_count = 0
for identity in all_server_identities:
try:
client.resetAllProxyStatuses(identity)
logger.info(f" - Sent unban_all for '{identity}'.")
success_count += 1
except Exception as e:
logger.error(f" - Failed to unban all proxies for '{identity}': {e}")
fail_count += 1
print(f"\nSuccessfully sent unban_all requests for {success_count} server identities.")
if fail_count > 0:
print(f"Failed to send unban_all requests for {fail_count} server identities. See logs for details.")
except Exception as proxy_e:
logger.error(f"Error during proxy action '{action}': {proxy_e}", exc_info=True)
print(f"\nERROR during proxy action: {proxy_e}")
@ -552,15 +685,16 @@ with DAG(
"action": Param( "action": Param(
"list_with_status", "list_with_status",
type="string", type="string",
enum=["list_with_status", "ban", "unban", "unban_all", "delete_from_redis", "clear_dag_runs"], enum=["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis", "clear_dag_runs"],
description="""The management action to perform. description="""The management action to perform.
--- ---
#### Actions for `entity: proxy` #### Actions for `entity: proxy`
- `list_with_status`: View status of all proxies, optionally filtered by `server_identity`. - `list_with_status`: View status of all proxies, optionally filtered by `server_identity`.
- `ban`: Ban a specific proxy for a given `server_identity`. Requires `proxy_url`. - `ban`: Ban a specific proxy for a given `server_identity`. Requires `proxy_url`.
- `unban`: Un-ban a specific proxy. Requires `proxy_url`. - `unban`: Un-ban a specific proxy. Requires `proxy_url`.
- `unban_all`: Resets the status of all proxies for a given `server_identity` to `ACTIVE`. - `ban_all`: Sets the status of all proxies for a given `server_identity` (or all servers) to `BANNED`.
- `delete_from_redis`: **(Destructive)** Deletes a proxy's state from Redis for a specific `server_identity`. This removes its state (ACTIVE/BANNED) and its failure history. The server will re-create it with a default `ACTIVE` state on its next refresh if the proxy is still in the server's configuration. Use this to reset a single proxy's state completely. Requires `proxy_url` and `server_identity`. - `unban_all`: Resets the status of all proxies for a given `server_identity` (or all servers) to `ACTIVE`.
- `delete_from_redis`: **(Destructive)** Deletes proxy **state** from Redis. This action does not remove the proxy from the service's configuration, but rather resets its status (ban/active, success/failure counts) to the default. The service will continue to manage the proxy. If `proxy_url` and `server_identity` are provided, it deletes a single proxy's state. If they are omitted, it deletes **ALL** proxy state keys (`proxy_status:*`).
#### Actions for `entity: account` #### Actions for `entity: account`
- `list_with_status`: View status of all accounts, optionally filtered by `account_id` (as a prefix). - `list_with_status`: View status of all accounts, optionally filtered by `account_id` (as a prefix).
@ -574,8 +708,9 @@ with DAG(
- `list_with_status`: View statuses for both proxies and accounts.
- `ban`: Ban a specific proxy AND a specific account. Requires `proxy_url`, `server_identity`, and `account_id`.
- `unban`: Un-ban a specific proxy AND a specific account. Requires `proxy_url`, `server_identity`, and `account_id`.
- `unban_all`: Un-ban all proxies for a `server_identity` AND all accounts (optionally filtered by `account_id` as a prefix).
- `delete_from_redis`: Deletes a specific proxy's state AND all accounts matching a prefix from Redis.
- `ban_all`: Ban all proxies for a `server_identity` (or all servers). Does not affect accounts.
- `unban_all`: Un-ban all proxies for a `server_identity` (or all servers) AND all accounts (optionally filtered by `account_id` as a prefix).
- `delete_from_redis`: Deletes proxy and account **state** from Redis. For proxies, this resets their status but they remain managed by the service. For accounts, this permanently removes them from the system's tracking. If `proxy_url` and `server_identity` are provided, it deletes a single proxy's state. If they are omitted, it deletes **ALL** proxy state (keys matching `proxy_status:*`). It will also delete all accounts matching the `account_id` prefix (or all accounts if `account_id` is empty).
#### Actions for `entity: airflow_meta` #### Actions for `entity: airflow_meta`
- `clear_dag_runs`: **(Destructive)** Deletes DAG run history and associated task instances from the database, removing them from the UI. This allows the runs to be re-created if backfilling is enabled. - `clear_dag_runs`: **(Destructive)** Deletes DAG run history and associated task instances from the database, removing them from the UI. This allows the runs to be re-created if backfilling is enabled.
@ -584,9 +719,9 @@ with DAG(
""", """,
), ),
"server_identity": Param( "server_identity": Param(
"ytdlp-ops-airflow-service", None,
type=["null", "string"], type=["null", "string"],
description="The identity of the server instance (for proxy management).", description="The identity of the server instance (for proxy management). Leave blank to list all.",
), ),
"proxy_url": Param( "proxy_url": Param(
None, None,

View File

@ -549,7 +549,7 @@ with DAG(
""", """,
params={ params={
"action": Param( "action": Param(
"add_videos", "list_contents",
type="string", type="string",
enum=["add_videos", "clear_queue", "list_contents", "check_status", "requeue_failed", "inspect_celery_cluster"], enum=["add_videos", "clear_queue", "list_contents", "check_status", "requeue_failed", "inspect_celery_cluster"],
title="Action", title="Action",

View File

@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
#
# Copyright © 2024 rl
#
# Distributed under terms of the MIT license.
"""
Maintenance DAG for managing the lifecycle of ytdlp-ops accounts.
This DAG is responsible for:
- Un-banning accounts whose ban duration has expired.
- Transitioning accounts from RESTING to ACTIVE after their cooldown period.
- Transitioning accounts from ACTIVE to RESTING after their active duration.
This logic was previously handled inside the ytdlp-ops-server and has been
moved here to give the orchestrator full control over account state.
"""
from __future__ import annotations
import logging
import time
from datetime import datetime
from airflow.decorators import task
from airflow.models import Variable
from airflow.models.dag import DAG
from airflow.utils.dates import days_ago
# Import utility functions and Thrift modules
from utils.redis_utils import _get_redis_client
from pangramia.yt.tokens_ops import YTTokenOpService
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport
# Configure logging
logger = logging.getLogger(__name__)
# Default settings from Airflow Variables or hardcoded fallbacks
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)
DEFAULT_ARGS = {
'owner': 'airflow',
'retries': 1,
'retry_delay': 30,
'queue': 'maintenance',
}
# --- Helper Functions ---
def _get_thrift_client(host, port, timeout=60):
"""Helper to create and connect a Thrift client."""
transport = TSocket.TSocket(host, port)
transport.setTimeout(timeout * 1000)
transport = TTransport.TFramedTransport(transport)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = YTTokenOpService.Client(protocol)
transport.open()
logger.info(f"Connected to Thrift server at {host}:{port}")
return client, transport
@task
def manage_account_states():
"""
Fetches all account statuses and performs necessary state transitions.
"""
host = DEFAULT_YT_AUTH_SERVICE_IP
port = int(DEFAULT_YT_AUTH_SERVICE_PORT)
redis_conn_id = DEFAULT_REDIS_CONN_ID
client, transport = None, None
try:
client, transport = _get_thrift_client(host, port)
redis_client = _get_redis_client(redis_conn_id)
logger.info("Fetching all account statuses from the service...")
all_accounts = client.getAccountStatus(accountPrefix=None)
logger.info(f"Found {len(all_accounts)} accounts to process.")
accounts_to_unban = []
accounts_to_activate = []
accounts_to_rest = []
for acc in all_accounts:
if acc.status == "BANNED (expired)":
accounts_to_unban.append(acc.accountId)
elif acc.status == "RESTING (expired)":
accounts_to_activate.append(acc.accountId)
elif acc.status == "ACTIVE (should be resting)":
accounts_to_rest.append(acc.accountId)
# --- Perform State Transitions ---
# 1. Un-ban accounts via Thrift call
if accounts_to_unban:
logger.info(f"Un-banning {len(accounts_to_unban)} accounts: {accounts_to_unban}")
for acc_id in accounts_to_unban:
try:
client.unbanAccount(acc_id, "Automatic un-ban by Airflow maintenance DAG.")
logger.info(f"Successfully un-banned account '{acc_id}'.")
except Exception as e:
logger.error(f"Failed to un-ban account '{acc_id}': {e}")
# 2. Activate resting accounts via direct Redis write
if accounts_to_activate:
logger.info(f"Activating {len(accounts_to_activate)} accounts: {accounts_to_activate}")
now_ts = int(time.time())
with redis_client.pipeline() as pipe:
for acc_id in accounts_to_activate:
key = f"account_status:{acc_id}"
pipe.hset(key, "status", "ACTIVE")
pipe.hset(key, "active_since_timestamp", now_ts)
pipe.hset(key, "status_changed_timestamp", now_ts)
pipe.execute()
logger.info("Finished activating accounts.")
# 3. Rest active accounts via direct Redis write
if accounts_to_rest:
logger.info(f"Putting {len(accounts_to_rest)} accounts to rest: {accounts_to_rest}")
now_ts = int(time.time())
with redis_client.pipeline() as pipe:
for acc_id in accounts_to_rest:
key = f"account_status:{acc_id}"
pipe.hset(key, "status", "RESTING")
pipe.hset(key, "status_changed_timestamp", now_ts)
pipe.execute()
logger.info("Finished putting accounts to rest.")
finally:
if transport and transport.isOpen():
transport.close()
with DAG(
dag_id='ytdlp_ops_account_maintenance',
default_args=DEFAULT_ARGS,
schedule='*/5 * * * *', # Run every 5 minutes
start_date=days_ago(1),
catchup=False,
tags=['ytdlp', 'maintenance'],
doc_md=__doc__,
) as dag:
manage_account_states()
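The state hash this DAG writes can be checked directly in Redis. A minimal sketch (connection details are placeholders; the key pattern and field names come from the pipeline calls above):

```python
import redis

r = redis.Redis(host="localhost", port=6379, decode_responses=True)

for key in r.scan_iter("account_status:*"):
    state = r.hgetall(key)
    print(key, state.get("status"), state.get("status_changed_timestamp"))
```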

View File

@ -39,6 +39,7 @@ def dispatch_url_to_worker(**context):
url_bytes = client.lpop(inbox_queue)
if not url_bytes:
logger.info("Redis queue is empty. No work to dispatch. Skipping task.")
raise AirflowSkipException("Redis queue is empty. No work to dispatch.")
url_to_process = url_bytes.decode('utf-8')

View File

@ -253,7 +253,7 @@ with DAG(
# --- Worker Passthrough Parameters ---
'on_bannable_failure': Param(
'retry_with_new_account',
'stop_loop',
type="string",
enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'retry_and_ban_account_only', 'retry_on_connection_error'],
title="[Worker Param] On Bannable Failure Policy",
@ -264,7 +264,7 @@ with DAG(
),
'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="[Worker Param] Base name for Redis queues."),
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."),
'clients': Param('mweb,ios,android', type="string", description="[Worker Param] Comma-separated list of clients for token generation."),
'clients': Param('web', type="string", description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, mweb, ios, android, web_safari, web_embedded, web_music, web_creator"),
'account_pool': Param('ytdlp_account', type="string", description="[Worker Param] Account pool prefix or comma-separated list."),
'account_pool_size': Param(10, type=["integer", "null"], description="[Worker Param] If using a prefix for 'account_pool', this specifies the number of accounts to generate (e.g., 10 for 'prefix_01' through 'prefix_10'). Required when using a prefix."),
'service_ip': Param(DEFAULT_YT_AUTH_SERVICE_IP, type="string", description="[Worker Param] IP of the ytdlp-ops-server. Default is from Airflow variable YT_AUTH_SERVICE_IP or hardcoded."),

View File

@ -22,6 +22,7 @@ from airflow.models.param import Param
from airflow.models.xcom_arg import XComArg
from airflow.operators.dummy import DummyOperator
from airflow.utils.dates import days_ago
from airflow.utils.task_group import TaskGroup
from airflow.api.common.trigger_dag import trigger_dag
from datetime import datetime, timedelta
import json
@ -49,7 +50,7 @@ logger = logging.getLogger(__name__)
# Default settings from Airflow Variables or hardcoded fallbacks
DEFAULT_QUEUE_NAME = 'video_queue'
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_TIMEOUT = 600
DEFAULT_TIMEOUT = 3600
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)
@ -226,25 +227,125 @@ def get_token(initial_data: dict, **context):
@task.branch
def handle_bannable_error_branch(task_id_to_check: str, **context):
"""Inspects a failed task and routes to retry logic if the error is bannable."""
"""
Inspects a failed task and routes to retry logic if the error is retryable.
Routes to a fatal error handler for non-retryable infrastructure issues.
"""
ti = context['task_instance']
params = context['params']
error_details = ti.xcom_pull(task_ids=task_id_to_check, key='error_details')
if not error_details:
return None # Let DAG fail for unexpected errors
logger.error(f"Task {task_id_to_check} failed without error details. Marking as fatal.")
return 'handle_fatal_error'
error_code = error_details.get('error_code', '').strip()
policy = params.get('on_bannable_failure', 'retry_with_new_account')
is_bannable = error_code in ["SOCKS5_CONNECTION_FAILED", "SOCKET_TIMEOUT", "BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED"]
# Fatal Thrift connection errors that should stop all processing.
if error_code == 'TRANSPORT_ERROR':
logger.error(f"Fatal Thrift connection error from '{task_id_to_check}'. Stopping processing.")
return 'handle_fatal_error'
# Service-side connection errors that are potentially retryable.
connection_errors = ['SOCKS5_CONNECTION_FAILED', 'SOCKET_TIMEOUT', 'CAMOUFOX_TIMEOUT']
if error_code in connection_errors:
logger.info(f"Handling connection error '{error_code}' from '{task_id_to_check}'. Policy: '{policy}'")
if policy == 'stop_loop':
logger.warning(f"Connection error with 'stop_loop' policy. Marking as fatal.")
return 'handle_fatal_error'
else:
logger.info("Retrying with a new account without banning.")
return 'assign_new_account_for_direct_retry'
# Bannable errors (e.g., bot detection) that can be retried with a new account.
is_bannable = error_code in ["BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED"]
logger.info(f"Handling failure from '{task_id_to_check}'. Error code: '{error_code}', Policy: '{policy}'") logger.info(f"Handling failure from '{task_id_to_check}'. Error code: '{error_code}', Policy: '{policy}'")
if is_bannable and policy in ['retry_with_new_account', 'retry_and_ban_account_only']: if is_bannable:
return 'ban_account_and_prepare_for_retry' if policy in ['retry_with_new_account', 'retry_and_ban_account_only']:
if is_bannable and policy in ['retry_on_connection_error', 'retry_without_ban']: return 'ban_account_and_prepare_for_retry'
return 'assign_new_account_for_retry' if policy in ['retry_on_connection_error', 'retry_without_ban']:
if is_bannable: # stop_loop return 'assign_new_account_for_direct_retry'
return 'ban_account_and_fail' if policy == 'stop_loop':
return None # Not a bannable error, let DAG fail return 'ban_and_report_immediately'
# Any other error is considered fatal for this run.
logger.error(f"Unhandled or non-retryable error '{error_code}' from '{task_id_to_check}'. Marking as fatal.")
return 'handle_fatal_error'
@task_group(group_id='ban_and_retry_logic')
def ban_and_retry_logic(initial_data: dict):
"""
Task group that checks for sliding window failures before banning an account.
If the account meets ban criteria, it's banned. Otherwise, the ban is skipped
but the retry proceeds.
"""
@task.branch
def check_sliding_window_for_ban(data: dict, **context):
"""
Checks Redis for recent failures. If thresholds are met, proceeds to ban.
Otherwise, proceeds to a dummy task to allow retry without ban.
"""
params = context['params']
account_id = data['account_id']
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
# These thresholds should ideally be Airflow Variables to be configurable
failure_window_seconds = 3600 # 1 hour
failure_threshold_count = 5
failure_threshold_unique_proxies = 3
try:
redis_client = _get_redis_client(redis_conn_id)
failure_key = f"account_failures:{account_id}"
now = time.time()
window_start = now - failure_window_seconds
# 1. Remove old failures and get recent ones
redis_client.zremrangebyscore(failure_key, '-inf', window_start)
recent_failures = redis_client.zrange(failure_key, 0, -1)
if len(recent_failures) >= failure_threshold_count:
# Decode from bytes to string for processing
recent_failures_str = [f.decode('utf-8') for f in recent_failures]
# Failure format is "context:job_id:timestamp"
unique_proxies = {f.split(':')[0] for f in recent_failures_str}
if len(unique_proxies) >= failure_threshold_unique_proxies:
logger.warning(
f"Account {account_id} has failed {len(recent_failures)} times "
f"with {len(unique_proxies)} unique contexts in the last hour. Proceeding to ban."
)
return 'ban_account_task'
else:
logger.info(
f"Account {account_id} has {len(recent_failures)} failures, but only "
f"from {len(unique_proxies)} unique contexts (threshold is {failure_threshold_unique_proxies}). Skipping ban."
)
else:
logger.info(f"Account {account_id} has {len(recent_failures)} failures (threshold is {failure_threshold_count}). Skipping ban.")
except Exception as e:
logger.error(f"Error during sliding window check for account {account_id}: {e}. Skipping ban as a precaution.", exc_info=True)
return 'skip_ban_task'
@task(task_id='ban_account_task')
def ban_account_task(data: dict, **context):
"""Wrapper task to call the main ban_account function."""
ban_account(initial_data=data, reason="Banned by Airflow worker after sliding window check", **context)
@task(task_id='skip_ban_task')
def skip_ban_task():
"""Dummy task to represent the 'skip ban' path."""
pass
check_task = check_sliding_window_for_ban(data=initial_data)
ban_task_in_group = ban_account_task(data=initial_data)
skip_task = skip_ban_task()
check_task >> [ban_task_in_group, skip_task]
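The producer side of the `account_failures:<account_id>` sorted set is not shown in this hunk; a hedged sketch of how a failure event could be recorded so the check above can trim and count it (the `context:job_id:timestamp` member format and Unix-timestamp score are assumptions inferred from the reader code):

```python
import time
import redis

def record_account_failure(r: redis.Redis, account_id: str, proxy_context: str, job_id: str) -> None:
    """Append one failure event to the per-account sliding window (illustrative)."""
    now = time.time()
    member = f"{proxy_context}:{job_id}:{int(now)}"           # format assumed from the check above
    r.zadd(f"account_failures:{account_id}", {member: now})   # score = timestamp for zremrangebyscore
```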
@task
def ban_account(initial_data: dict, reason: str, **context):
@ -264,8 +365,8 @@ def ban_account(initial_data: dict, reason: str, **context):
transport.close() transport.close()
@task @task
def assign_new_account_for_retry(initial_data: dict, **context): def assign_new_account_for_direct_retry(initial_data: dict, **context):
"""Selects a new, unused account for the retry attempt.""" """Selects a new, unused account for a direct retry (e.g., after connection error)."""
params = context['params'] params = context['params']
accounts_tried = initial_data['accounts_tried'] accounts_tried = initial_data['accounts_tried']
account_pool = _get_account_pool(params) account_pool = _get_account_pool(params)
@ -285,10 +386,33 @@ def assign_new_account_for_retry(initial_data: dict, **context):
} }
@task @task
def ban_and_fail(initial_data: dict, reason: str, **context): def assign_new_account_after_ban_check(initial_data: dict, **context):
"""Bans an account and then intentionally fails the task to stop the DAG.""" """Selects a new, unused account for the retry attempt after a ban check."""
params = context['params']
accounts_tried = initial_data['accounts_tried']
account_pool = _get_account_pool(params)
available_for_retry = [acc for acc in account_pool if acc not in accounts_tried]
if not available_for_retry:
raise AirflowException("No other accounts available in the pool for a retry.")
new_account_id = random.choice(available_for_retry)
accounts_tried.append(new_account_id)
logger.info(f"Selected new account for retry: '{new_account_id}'")
# Return updated initial_data with new account
return {
'url_to_process': initial_data['url_to_process'],
'account_id': new_account_id,
'accounts_tried': accounts_tried,
}
@task
def ban_and_report_immediately(initial_data: dict, reason: str, **context):
"""Bans an account and prepares for failure reporting and continuing the loop."""
ban_account(initial_data, reason, **context) ban_account(initial_data, reason, **context)
raise AirflowException(f"Failing task as per policy. Reason: {reason}") logger.info(f"Account '{initial_data.get('account_id')}' banned. Proceeding to report failure.")
# This task is a leaf in its path and is followed by the failure reporting task.
return initial_data # Pass data along if needed by reporting
@task @task
def download_and_probe(token_data: dict, **context): def download_and_probe(token_data: dict, **context):
@ -297,6 +421,7 @@ def download_and_probe(token_data: dict, **context):
This version uses subprocess directly with an argument list for better security and clarity. This version uses subprocess directly with an argument list for better security and clarity.
""" """
import subprocess import subprocess
import shlex
params = context['params'] params = context['params']
info_json_path = token_data.get('info_json_path') info_json_path = token_data.get('info_json_path')
@ -316,6 +441,7 @@ def download_and_probe(token_data: dict, **context):
"""Constructs and runs the yt-dlp command, returning the final filename.""" """Constructs and runs the yt-dlp command, returning the final filename."""
cmd = [ cmd = [
'yt-dlp', 'yt-dlp',
'--verbose',
'--load-info-json', info_json_path, '--load-info-json', info_json_path,
'-f', download_format, '-f', download_format,
'-o', full_output_path, '-o', full_output_path,
@ -335,9 +461,10 @@ def download_and_probe(token_data: dict, **context):
if original_url: if original_url:
cmd.append(original_url) cmd.append(original_url)
logger.info(f"Executing yt-dlp command: {' '.join(cmd)}") copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
logger.info(f"Executing yt-dlp command: {copy_paste_cmd}")
process = subprocess.run(cmd, capture_output=True, text=True, timeout=1800) process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
if process.returncode != 0: if process.returncode != 0:
logger.error(f"yt-dlp failed with exit code {process.returncode}") logger.error(f"yt-dlp failed with exit code {process.returncode}")
@ -405,12 +532,100 @@ def mark_url_as_success(initial_data: dict, downloaded_file_path: str, token_dat
logger.info(f"Stored success result for URL '{url}'.") logger.info(f"Stored success result for URL '{url}'.")
@task(trigger_rule='one_failed') @task(trigger_rule='one_failed')
def handle_generic_failure(**context): def report_failure_and_continue(**context):
"""Handles any failure in the DAG by recording a detailed error report to Redis.""" """
# This task is simplified for brevity. The original's detailed logic can be ported here. Handles a failed URL processing attempt by recording a detailed error report to Redis.
logger.error("A failure occurred in the DAG. See previous task logs for details.") This is a common endpoint for various failure paths that should not stop the overall dispatcher loop.
# In a real scenario, this would pull XComs and build a rich report like the original. """
raise AirflowException("Failing task to mark DAG run as failed after error.") params = context['params']
ti = context['task_instance']
url = params.get('url_to_process', 'unknown')
# Collect error details from XCom
error_details = {}
# Check for error details from get_token tasks
first_token_task_id = 'get_token'
retry_token_task_id = 'retry_get_token'
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
# Use the most recent error details
if retry_token_error:
error_details = retry_token_error
elif first_token_error:
error_details = first_token_error
else:
# Check for other possible error sources
# This is a simplified approach - in a real implementation you might want to
# check more task IDs or use a more sophisticated error collection mechanism
pass
logger.error(f"A failure occurred while processing URL '{url}'. Reporting to Redis.")
result_data = {
'status': 'failed',
'end_time': time.time(),
'url': url,
'dag_run_id': context['dag_run'].run_id,
'error_details': error_details
}
try:
client = _get_redis_client(params['redis_conn_id'])
client.hset(f"{params['queue_name']}_result", url, json.dumps(result_data))
logger.info(f"Stored failure result for URL '{url}'.")
except Exception as e:
logger.error(f"Could not report failure to Redis: {e}", exc_info=True)
@task(trigger_rule='one_failed')
def handle_fatal_error(**context):
"""
Handles fatal, non-retryable errors (e.g., infrastructure issues).
This task reports the failure to Redis before failing the DAG run to ensure
failed URLs are queued for later reprocessing, then stops the processing loop.
"""
params = context['params']
ti = context['task_instance']
url = params.get('url_to_process', 'unknown')
# Collect error details
error_details = {}
first_token_task_id = 'get_token'
retry_token_task_id = 'retry_get_token'
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
# Use the most recent error details
if retry_token_error:
error_details = retry_token_error
elif first_token_error:
error_details = first_token_error
logger.error(f"A fatal, non-retryable error occurred for URL '{url}'. See previous task logs for details.")
# Report failure to Redis so the URL can be reprocessed later
try:
result_data = {
'status': 'failed',
'end_time': time.time(),
'url': url,
'dag_run_id': context['dag_run'].run_id,
'error': 'fatal_error',
'error_message': 'Fatal non-retryable error occurred',
'error_details': error_details
}
client = _get_redis_client(params['redis_conn_id'])
client.hset(f"{params['queue_name']}_result", url, json.dumps(result_data))
logger.info(f"Stored fatal error result for URL '{url}' in Redis for later reprocessing.")
except Exception as e:
logger.error(f"Could not report fatal error to Redis: {e}", exc_info=True)
# Fail the DAG run to prevent automatic continuation of the processing loop
raise AirflowException("Failing DAG due to fatal error. The dispatcher loop will stop.")
@task(trigger_rule='one_success') @task(trigger_rule='one_success')
@ -422,8 +637,10 @@ def continue_processing_loop(**context):
params = context['params'] params = context['params']
dag_run = context['dag_run'] dag_run = context['dag_run']
# Create a new unique run_id for the dispatcher, tied to this worker's run. # Create a new unique run_id for the dispatcher.
new_dispatcher_run_id = f"retriggered_by_{dag_run.run_id}" # Using a timestamp and UUID ensures the ID is unique and does not grow in length over time,
# preventing database errors.
new_dispatcher_run_id = f"retriggered_by_worker_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
# Pass all original parameters from the orchestrator through to the new dispatcher run. # Pass all original parameters from the orchestrator through to the new dispatcher run.
conf_to_pass = {k: v for k, v in params.items() if v is not None} conf_to_pass = {k: v for k, v in params.items() if v is not None}
@ -441,6 +658,48 @@ def continue_processing_loop(**context):
) )
@task.branch(trigger_rule='one_failed')
def handle_retry_failure_branch(task_id_to_check: str, **context):
"""
Inspects a failed retry attempt and decides on the final action.
On retry, most errors are considered fatal for the URL, but not for the system.
"""
ti = context['task_instance']
error_details = ti.xcom_pull(task_ids=task_id_to_check, key='error_details')
if not error_details:
return 'handle_fatal_error'
error_code = error_details.get('error_code', '').strip()
if error_code == 'TRANSPORT_ERROR':
logger.error(f"Fatal Thrift connection error on retry from '{task_id_to_check}'.")
return 'handle_fatal_error'
is_bannable = error_code in ["BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED"]
if is_bannable:
logger.warning(f"Bannable error '{error_code}' on retry. Banning account and reporting failure.")
return 'ban_and_report_after_retry'
logger.error(f"URL failed on retry with code '{error_code}'. Reporting failure and continuing loop.")
return 'report_failure_and_continue'
@task
def ban_and_report_after_retry(retry_data: dict, reason: str, **context):
"""Bans the account used in a failed retry and prepares for failure reporting."""
# The account to ban is the one from the retry attempt.
ban_account(retry_data, reason, **context)
logger.info(f"Account '{retry_data.get('account_id')}' banned after retry failed. Proceeding to report failure.")
return retry_data
@task.branch(trigger_rule='one_failed')
def handle_download_failure_branch(**context):
"""If download or probe fails, routes to the standard failure reporting."""
logger.warning("Download or probe failed. Reporting failure and continuing loop.")
return 'report_failure_and_continue'
@task(trigger_rule='one_success')
def coalesce_token_data(get_token_result=None, retry_get_token_result=None):
"""
@ -457,7 +716,7 @@ def coalesce_token_data(get_token_result=None, retry_get_token_result=None):
raise AirflowException("Could not find a successful token result from any attempt.")
# =============================================================================
# DAG Definition with TaskGroups
# =============================================================================
with DAG(
dag_id='ytdlp_ops_worker_per_url',
@ -476,7 +735,7 @@ with DAG(
'account_pool': Param('default_account', type="string"),
'account_pool_size': Param(None, type=["integer", "null"]),
'machine_id': Param(None, type=["string", "null"]),
'clients': Param('web', type="string"),
'timeout': Param(DEFAULT_TIMEOUT, type="integer"),
'download_format': Param('ba[ext=m4a]/bestaudio/best', type="string"),
'output_path_template': Param("%(title)s [%(id)s].%(ext)s", type="string"),
@ -488,64 +747,112 @@ with DAG(
'worker_queue': Param(None, type=["string", "null"]),
}
) as dag:
initial_data = get_url_and_assign_account() initial_data = get_url_and_assign_account()
# --- Task Instantiation with TaskGroups ---
# Main success/failure handlers (outside groups for clear end points)
fatal_error_task = handle_fatal_error()
report_failure_task = report_failure_and_continue()
continue_loop_task = continue_processing_loop()
# --- Task Group 1: Initial Attempt ---
with TaskGroup("initial_attempt", tooltip="Initial token acquisition attempt") as initial_attempt_group:
first_token_attempt = get_token(initial_data)
initial_branch_task = handle_bannable_error_branch.override(trigger_rule='one_failed')(
task_id_to_check=first_token_attempt.operator.task_id
)
# Tasks for the "stop_loop" policy on initial attempt
ban_and_report_immediately_task = ban_and_report_immediately.override(task_id='ban_and_report_immediately')(
initial_data=initial_data,
reason="Banned by Airflow worker (policy is stop_loop)"
)
first_token_attempt >> initial_branch_task
initial_branch_task >> [fatal_error_task, ban_and_report_immediately_task]
# --- Task Group 2: Retry Logic ---
with TaskGroup("retry_logic", tooltip="Retry logic with account management") as retry_logic_group:
# Retry path tasks
ban_and_retry_group = ban_and_retry_logic.override(group_id='ban_account_and_prepare_for_retry')(
initial_data=initial_data
)
# This task is for retries after a ban check
after_ban_account_task = assign_new_account_after_ban_check.override(task_id='assign_new_account_after_ban_check')(
initial_data=initial_data
)
# This task is for direct retries (e.g., on connection error)
direct_retry_account_task = assign_new_account_for_direct_retry.override(task_id='assign_new_account_for_direct_retry')(
initial_data=initial_data
)
@task(trigger_rule='one_success')
def coalesce_retry_data(direct_retry_data=None, after_ban_data=None):
"""Coalesces account data from one of the two mutually exclusive retry paths."""
if direct_retry_data:
return direct_retry_data
if after_ban_data:
return after_ban_data
raise AirflowException("Could not find valid account data for retry.")
coalesced_retry_data = coalesce_retry_data(
direct_retry_data=direct_retry_account_task,
after_ban_data=after_ban_account_task
)
retry_token_task = get_token.override(task_id='retry_get_token')(
initial_data=coalesced_retry_data
)
# Retry failure branch and its tasks
retry_branch_task = handle_retry_failure_branch.override(trigger_rule='one_failed')(
task_id_to_check=retry_token_task.operator.task_id
)
ban_after_retry_report_task = ban_and_report_after_retry.override(task_id='ban_and_report_after_retry')(
retry_data=coalesced_retry_data,
reason="Banned by Airflow worker after failed retry"
)
# Internal dependencies within retry group
ban_and_retry_group >> after_ban_account_task
after_ban_account_task >> coalesced_retry_data
direct_retry_account_task >> coalesced_retry_data
coalesced_retry_data >> retry_token_task
retry_token_task >> retry_branch_task
retry_branch_task >> [fatal_error_task, report_failure_task, ban_after_retry_report_task]
ban_after_retry_report_task >> report_failure_task
# --- Task Group 3: Download and Processing ---
with TaskGroup("download_processing", tooltip="Download and media processing") as download_processing_group:
# Coalesce, download, and success tasks
token_data = coalesce_token_data(
get_token_result=first_token_attempt,
retry_get_token_result=retry_token_task
)
download_task = download_and_probe(token_data=token_data)
download_branch_task = handle_download_failure_branch.override(trigger_rule='one_failed')()
success_task = mark_url_as_success(
initial_data=initial_data,
downloaded_file_path=download_task,
token_data=token_data
)
# Internal dependencies within download group
first_token_attempt >> token_data
retry_token_task >> token_data
token_data >> download_task
download_task >> download_branch_task
download_branch_task >> report_failure_task
download_task >> success_task
success_task >> continue_loop_task
# --- DAG Dependencies between TaskGroups ---
# Initial attempt can lead to retry logic or direct failure
initial_branch_task >> [retry_logic_group, fatal_error_task, ban_and_report_immediately_task]
# Retry logic leads to download processing on success or failure reporting on failure
retry_branch_task >> [download_processing_group, report_failure_task]
# Ban and report immediately leads to failure reporting
ban_and_report_immediately_task >> report_failure_task

airflow/deploy-dl.sh Executable file

@ -0,0 +1,89 @@
#!/bin/bash
set -euo pipefail
# --- Environment Setup ---
ENV=""
# Parse command-line arguments
if [[ "$#" -gt 0 && "$1" == "--env" ]]; then
if [[ -n "${2:-}" && ("$2" == "prod" || "$2" == "test") ]]; then
ENV="$2"
else
echo "Error: Invalid environment specified for deploy-dl.sh. Use 'prod' or 'test'." >&2
exit 1
fi
else
echo "Usage: $0 --env [prod|test]" >&2
exit 1
fi
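# Example invocations (illustrative; run from the project root, e.g. via deploy_all.sh):
#   ./airflow/deploy-dl.sh --env test    # sync to the test DL worker(s)
#   ./airflow/deploy-dl.sh --env prod    # sync to the production DL worker(s)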
# --- Configuration ---
SSH_USER="alex_p"
if [[ "$ENV" == "prod" ]]; then
WORKER_SERVERS=("dl003")
elif [[ "$ENV" == "test" ]]; then
WORKER_SERVERS=("dl001")
fi
REMOTE_DEST_PATH="/srv/airflow_dl_worker/"
# List of files and directories to sync from the project root.
# This script assumes it is run from the project root via deploy_all.sh
ROOT_FILES_TO_SYNC=(
"Dockerfile"
"get_info_json_client.py"
"proxy_manager_client.py"
"setup.py"
"VERSION"
"generate_tokens_direct.mjs"
)
AIRFLOW_FILES_TO_SYNC=(
"docker-compose-ytdlp-ops.yaml"
"init-airflow.sh"
)
DIRS_TO_SYNC=(
"airflow/camoufox/"
"airflow/inputfiles/"
"server_fix/"
"token_generator/"
"utils/"
"yt_ops_services/"
)
# Use an array so the --exclude patterns reach rsync without literal quote characters.
RSYNC_OPTS=(-avz --progress --delete --exclude='__pycache__/' --exclude='*.pyc' --exclude='*.pyo' --exclude='node_modules/')
echo ">>> Deploying to DL WORKER(S) for environment: $ENV"
# --- Deployment ---
for worker in "${WORKER_SERVERS[@]}"; do
WORKER_HOST="${SSH_USER}@${worker}"
echo "--------------------------------------------------"
echo ">>> Deploying to WORKER: $WORKER_HOST"
echo "--------------------------------------------------"
echo ">>> Creating remote directory on WORKER: $WORKER_HOST"
ssh "$WORKER_HOST" "mkdir -p $REMOTE_DEST_PATH"
echo ">>> Syncing individual files to WORKER..."
for f in "${ROOT_FILES_TO_SYNC[@]}"; do
echo " - Syncing $f"
rsync "${RSYNC_OPTS[@]}" "$f" "$WORKER_HOST:$REMOTE_DEST_PATH"
done
for f in "${AIRFLOW_FILES_TO_SYNC[@]}"; do
echo " - Syncing airflow/$f"
rsync "${RSYNC_OPTS[@]}" "airflow/$f" "$WORKER_HOST:$REMOTE_DEST_PATH"
done
echo ">>> Syncing directories to WORKER..."
for d in "${DIRS_TO_SYNC[@]}"; do
echo " - Syncing $d"
rsync "${RSYNC_OPTS[@]}" "$d" "$WORKER_HOST:$REMOTE_DEST_PATH"
done
echo ">>> Linking worker compose file on remote..."
ssh "$WORKER_HOST" "cd $REMOTE_DEST_PATH && ln -sf docker-compose-ytdlp-ops.yaml docker-compose.yaml"
done
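# After the sync, each worker can start the stack using the default compose file name created above,
# e.g. (illustrative): ssh "$WORKER_HOST" "cd $REMOTE_DEST_PATH && docker compose up -d"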
echo ">>> DL WORKER(S) deployment sync complete."
exit 0

airflow/deploy-master.sh Executable file

@ -0,0 +1,77 @@
#!/bin/bash
set -euo pipefail
# --- Environment Setup ---
ENV=""
# Parse command-line arguments
if [[ "$#" -gt 0 && "$1" == "--env" ]]; then
if [[ -n "${2:-}" && ("$2" == "prod" || "$2" == "test") ]]; then
ENV="$2"
else
echo "Error: Invalid environment specified for deploy-master.sh. Use 'prod' or 'test'." >&2
exit 1
fi
else
echo "Usage: $0 --env [prod|test]" >&2
exit 1
fi
# --- Configuration ---
SSH_USER="alex_p"
if [[ "$ENV" == "prod" ]]; then
MASTER_SERVER="af-green"
elif [[ "$ENV" == "test" ]]; then
MASTER_SERVER="af-test"
fi
REMOTE_DEST_PATH="/srv/airflow_master/"
MASTER_HOST="${SSH_USER}@${MASTER_SERVER}"
# List of files and directories to sync from the project root.
# This script assumes it is run from the project root via deploy_all.sh
ROOT_FILES_TO_SYNC=(
"Dockerfile"
"get_info_json_client.py"
"proxy_manager_client.py"
"setup.py"
"VERSION"
)
AIRFLOW_FILES_TO_SYNC=(
"docker-compose-master.yaml"
"init-airflow.sh"
"nginx.conf"
)
DIRS_TO_SYNC=(
"airflow/inputfiles/"
"server_fix/"
"yt_ops_services/"
)
# Use an array so the --exclude patterns reach rsync without literal quote characters.
RSYNC_OPTS=(-avz --progress --delete --exclude='__pycache__/' --exclude='*.pyc' --exclude='*.pyo' --exclude='node_modules/')
echo ">>> Deploying to MASTER for environment: $ENV"
# --- Deployment ---
echo ">>> Creating remote directory on MASTER: $MASTER_HOST"
ssh "$MASTER_HOST" "mkdir -p $REMOTE_DEST_PATH"
echo ">>> Syncing individual files to MASTER..."
for f in "${ROOT_FILES_TO_SYNC[@]}"; do
rsync "${RSYNC_OPTS[@]}" "$f" "$MASTER_HOST:$REMOTE_DEST_PATH"
done
for f in "${AIRFLOW_FILES_TO_SYNC[@]}"; do
rsync "${RSYNC_OPTS[@]}" "airflow/$f" "$MASTER_HOST:$REMOTE_DEST_PATH"
done
echo ">>> Syncing directories to MASTER..."
for d in "${DIRS_TO_SYNC[@]}"; do
rsync "${RSYNC_OPTS[@]}" "$d" "$MASTER_HOST:$REMOTE_DEST_PATH"
done
echo ">>> Linking master compose file on remote..."
ssh "$MASTER_HOST" "cd $REMOTE_DEST_PATH && ln -sf docker-compose-master.yaml docker-compose.yaml"
echo ">>> MASTER deployment sync complete."
exit 0


@ -0,0 +1,144 @@
# Airflow remote DL worker configuration.
# This file should be used on a remote machine to run a download worker.
# It requires a master Airflow instance running with services exposed.
#
# Before running, create a .env file in this directory with:
# MASTER_HOST_IP=... a.b.c.d ... # IP address of the machine running docker-compose-master.yaml
# POSTGRES_PASSWORD=... # The password for the PostgreSQL database from the master compose file
# REDIS_PASSWORD=... # The password for Redis from the master compose file
# AIRFLOW_UID=... # User ID for file permissions, should match master
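# A minimal example .env (placeholder values; substitute the real credentials from the master deployment):
#   MASTER_HOST_IP=192.0.2.10
#   POSTGRES_PASSWORD=change_me
#   REDIS_PASSWORD=change_me
#   AIRFLOW_UID=50000
#   HOSTNAME=dl001   # optional, defaults to dl001 below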
---
x-airflow-common:
&airflow-common
# This should point to the same image used by the master.
# If you built a custom image for master, you need to push it to a registry
# and reference it here.
image: ${AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest}
build: .
# Add extra hosts here to allow workers to resolve other hosts by name.
# This section is auto-generated by Ansible from the inventory.
extra_hosts:
{% for host in groups['all'] %}
- "{{ hostvars[host]['inventory_hostname'] }}:{{ hostvars[host]['ansible_host'] }}"
{% endfor %}
env_file:
- .env
environment:
&airflow-common-env
# Airflow Core
AIRFLOW__CORE__EXECUTOR: CeleryExecutor
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
AIRFLOW__CORE__FERNET_KEY: '' # Should be same as master, but worker does not need it.
# Backend connections - These should point to the master node
# Set MASTER_HOST_IP, POSTGRES_PASSWORD, and REDIS_PASSWORD in your .env file
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}@${MASTER_HOST_IP}:5432/airflow
AIRFLOW__CELERY__BROKER_URL: redis://:${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT}@${MASTER_HOST_IP}:52909/0
AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}@${MASTER_HOST_IP}:5432/airflow
# Remote Logging - connection is fetched from DB, which is on master
AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
AIRFLOW__CORE__INTERNAL_API_SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
AIRFLOW__CORE__LOCAL_SETTINGS_PATH: "/opt/airflow/config/custom_task_hooks.py"
volumes:
# Mount dags to get any utility scripts, but the worker will pull the DAG from the DB
- ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
# Mount logs locally in case remote logging fails
- ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
# Mount config for local settings and other configurations
- ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
# Mount download directories
- ${AIRFLOW_PROJ_DIR:-.}/downloadfiles:/opt/airflow/downloadfiles
- ${AIRFLOW_PROJ_DIR:-.}/addfiles:/opt/airflow/addfiles
- ${AIRFLOW_PROJ_DIR:-.}/inputfiles:/opt/airflow/inputfiles
# Use AIRFLOW_UID and AIRFLOW_GID from .env file to fix permission issues.
user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-0}"
services:
airflow-worker:
<<: *airflow-common
container_name: airflow-dl-worker-1
hostname: ${HOSTNAME:-dl001}
# The worker now listens on the generic queue AND its own dedicated queue.
# The hostname is dynamically inserted into the queue name.
command: airflow celery worker -q queue-dl,queue-dl-${HOSTNAME:-dl001}
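# For example, with HOSTNAME=dl001 this worker consumes from both "queue-dl" and "queue-dl-dl001".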
deploy:
resources:
limits:
# Increased from 4G to 8G to support higher memory per child process.
memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT:-8G}
reservations:
memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_RESERV:-2G}
healthcheck:
test:
- "CMD-SHELL"
- 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-dl@$$(hostname)"'
interval: 30s
timeout: 30s
retries: 5
start_period: 30s
environment:
<<: *airflow-common-env
HOSTNAME: ${HOSTNAME:-dl001} # Explicitly set inside container
DUMB_INIT_SETSID: "0"
AIRFLOW__CELERY__WORKER_QUEUES: "queue-dl,queue-dl-${HOSTNAME:-dl001}"
AIRFLOW__CELERY__WORKER_TAGS: "dl"
AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
AIRFLOW__CELERY__WORKER_CONCURRENCY: ${AIRFLOW_WORKER_DOWNLOAD_CONCURRENCY:-16}
AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
AIRFLOW__CELERY__WORKER_NAME: "worker-dl@%h"
AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
# Increased from 256MB to 512MB for memory-intensive yt-dlp tasks.
# This value is in KB. 512 * 1024 = 524288.
AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "524288" # 512MB
# The hostname is now managed by Docker Compose to ensure uniqueness when scaling.
# It will be generated based on project, service, and replica number (e.g., airflow-airflow-dl-worker-1).
# hostname: "dl-worker-${HOSTNAME_SUFFIX:-$$(hostname)}"
ports:
- "8793:8793"
networks:
- default
- proxynet
restart: always
airflow-triggerer:
<<: *airflow-common
container_name: airflow-dl-triggerer-1
hostname: ${HOSTNAME}
command: triggerer
healthcheck:
test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
interval: 30s
timeout: 30s
retries: 5
start_period: 60s
environment:
<<: *airflow-common-env
PYTHONASYNCIODEBUG: 1
DUMB_INIT_SETSID: 0
restart: always
docker-socket-proxy:
profiles:
- disabled
image: tecnativa/docker-socket-proxy:0.1.1
environment:
CONTAINERS: 1
IMAGES: 1
AUTH: 1
POST: 1
privileged: true
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
restart: always
networks:
proxynet:
name: airflow_proxynet
external: true


@ -0,0 +1,534 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
#
# WARNING: This configuration is for local development. Do not use it in a production deployment.
#
# This configuration supports basic configuration using environment variables or an .env file
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
# Default: apache/airflow:2.10.5
# AIRFLOW_UID - User ID in Airflow containers
# Default: 50000
# AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed.
# Default: .
# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
#
# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested).
# Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested).
# Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
# Use this option ONLY for quick checks. Installing requirements at container
# startup is done EVERY TIME the service is started.
# A better way is to build a custom image or extend the official image
# as described in https://airflow.apache.org/docs/docker-stack/build.html.
# Default: ''
#
# Feel free to modify this file to suit your needs.
---
name: airflow-master
x-minio-common: &minio-common
image: quay.io/minio/minio:RELEASE.2025-07-23T15-54-02Z
command: server --console-address ":9001" http://minio{1...3}/data{1...2}
expose:
- "9000"
- "9001"
networks:
- proxynet
env_file:
- .env
environment:
MINIO_ROOT_USER: ${MINIO_ROOT_USER:-admin}
MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-0153093693-0009}
healthcheck:
test: ["CMD", "mc", "ready", "local"]
interval: 5s
timeout: 5s
retries: 5
restart: always
x-airflow-common:
&airflow-common
# In order to add custom dependencies or upgrade provider packages you can use your extended image.
# This will build the image from the Dockerfile in this directory and tag it.
image: ${AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest}
build: .
# Add extra hosts here to allow the master services (webserver, scheduler) to resolve
# the hostnames of your remote DL workers. This is crucial for fetching logs.
# Format: - "hostname:ip_address"
# IMPORTANT: This section is auto-generated from cluster.yml
extra_hosts:
- "af-test:89.253.223.97"
- "dl001:109.107.189.106"
env_file:
- .env
networks:
- proxynet
environment:
&airflow-common-env
AIRFLOW__CORE__PARALLELISM: 64
AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG: 32
AIRFLOW__SCHEDULER__PARSING_PROCESSES: 4
AIRFLOW__CORE__EXECUTOR: CeleryExecutor
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}@postgres/airflow
AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}@postgres/airflow
AIRFLOW__CELERY__BROKER_URL: redis://:${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT}@redis:6379/0
AIRFLOW__CORE__FERNET_KEY: ''
AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
AIRFLOW__CORE__INTERNAL_API_SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
# yamllint disable rule:line-length
# Use simple http server on scheduler for health checks
# See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
# yamllint enable rule:line-length
AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
# WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for quick checks
# for other purposes (development, test and especially production usage) build/extend Airflow image.
#_PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- apache-airflow-providers-docker apache-airflow-providers-http thrift>=0.16.0,<=0.20.0 backoff>=2.2.1 python-dotenv==1.0.1 psutil>=5.9.0}
# The following line can be used to set a custom config file, stored in the local config folder
# If you want to use it, outcomment it and replace airflow.cfg with the name of your config file
AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
AIRFLOW__LOGGING__REMOTE_LOG_FORMAT: "[%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s"
AIRFLOW__LOGGING__LOG_LEVEL: "INFO"
AIRFLOW__LOGGING__LOG_FILENAME_TEMPLATE: "{{ ti.dag_id }}/{{ ti.run_id }}/{{ ti.task_id }}/attempt={{ try_number }}.log"
AIRFLOW__CORE__LOCAL_SETTINGS_PATH: "/opt/airflow/config/custom_task_hooks.py"
volumes:
- ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
- ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
- ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
- ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
- ${AIRFLOW_PROJ_DIR:-.}/downloadfiles:/opt/airflow/downloadfiles
- ${AIRFLOW_PROJ_DIR:-.}/addfiles:/opt/airflow/addfiles
- ${AIRFLOW_PROJ_DIR:-.}/inputfiles:/opt/airflow/inputfiles
user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-0}"
depends_on:
&airflow-common-depends-on
redis:
condition: service_healthy
postgres:
condition: service_healthy
nginx-minio-lb:
condition: service_healthy
services:
postgres:
image: postgres:13
env_file:
- .env
networks:
- proxynet
environment:
POSTGRES_USER: airflow
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}
POSTGRES_DB: airflow
volumes:
- postgres-db-volume:/var/lib/postgresql/data
ports:
- "5432:5432"
healthcheck:
test: ["CMD", "pg_isready", "-U", "airflow"]
interval: 10s
retries: 5
start_period: 5s
restart: always
redis:
# Redis is limited to 7.2-bookworm due to licensing change
# https://redis.io/blog/redis-adopts-dual-source-available-licensing/
image: redis:7.2-bookworm
env_file:
- .env
networks:
- proxynet
command: sh -c "redis-server --requirepass ${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} --bind 0.0.0.0 --save 60 1 --loglevel warning --appendonly yes"
volumes:
- ./redis-data:/data
expose:
- 6379
ports:
- "52909:6379"
healthcheck:
test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT}", "ping"]
interval: 10s
timeout: 30s
retries: 50
start_period: 30s
restart: always
redis-proxy-account-clear:
image: redis:7.2-bookworm
container_name: redis-proxy-account-clear
env_file:
- .env
networks:
- proxynet
command: >
sh -c "
echo 'Clearing proxy and account statuses from Redis...';
redis-cli -h redis -a $${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} --scan --pattern 'proxy_status:*' | xargs -r redis-cli -h redis -a $${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} DEL;
redis-cli -h redis -a $${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} --scan --pattern 'account_status:*' | xargs -r redis-cli -h redis -a $${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} DEL;
echo 'Redis cleanup complete.'
"
depends_on:
redis:
condition: service_healthy
minio1:
<<: *minio-common
hostname: minio1
volumes:
- ./minio-data/1/1:/data1
- ./minio-data/1/2:/data2
minio2:
<<: *minio-common
hostname: minio2
volumes:
- ./minio-data/2/1:/data1
- ./minio-data/2/2:/data2
depends_on:
minio1:
condition: service_started
minio3:
<<: *minio-common
hostname: minio3
volumes:
- ./minio-data/3/1:/data1
- ./minio-data/3/2:/data2
depends_on:
minio2:
condition: service_started
nginx-minio-lb:
image: nginx:1.19.2-alpine
hostname: nginx-minio-lb
networks:
- proxynet
command: sh -c "apk add --no-cache curl >/dev/null 2>&1 && exec nginx -g 'daemon off;'"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
ports:
- "9000:9000"
- "9001:9001"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9001/minio/health/live"]
interval: 10s
timeout: 5s
retries: 5
start_period: 10s
depends_on:
minio1:
condition: service_healthy
minio2:
condition: service_healthy
minio3:
condition: service_healthy
restart: always
minio-init:
image: minio/mc
container_name: minio-init
networks:
- proxynet
depends_on:
nginx-minio-lb:
condition: service_healthy
entrypoint: >
/bin/sh -c "
set -e;
/usr/bin/mc alias set minio http://nginx-minio-lb:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD;
# Retry loop for bucket creation
MAX_ATTEMPTS=10
SUCCESS=false
# Use a for loop for robustness, as it's generally more portable than `until`.
for i in $$(seq 1 $$MAX_ATTEMPTS); do
# Check if the bucket exists. If so, we're done.
if /usr/bin/mc ls minio/airflow-logs > /dev/null 2>&1; then
echo 'MinIO bucket already exists.'
SUCCESS=true
break
fi
# If not, try to create it. If successful, we're done.
# We redirect output because `mc mb` can error if another process creates it in the meantime.
if /usr/bin/mc mb minio/airflow-logs > /dev/null 2>&1; then
echo 'MinIO bucket created.'
SUCCESS=true
break
fi
# If we reach here, both checks failed. Wait and retry.
echo "Attempt $$i/$$MAX_ATTEMPTS: Waiting for MinIO bucket..."
sleep 2
done
# After the loop, check if we succeeded.
if [ "$$SUCCESS" = "false" ]; then
echo "Failed to create MinIO bucket after $$MAX_ATTEMPTS attempts."
exit 1
fi
/usr/bin/mc anonymous set download minio/airflow-logs;
echo 'MinIO initialized: bucket airflow-logs created and policy set to download.';
"
env_file:
- .env
environment:
MINIO_ROOT_USER: ${MINIO_ROOT_USER:-admin}
MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-0153093693-0009}
restart: on-failure
nginx-healthcheck:
image: nginx:alpine
container_name: nginx-healthcheck
networks:
- proxynet
ports:
- "8888:80"
restart: always
airflow-webserver:
<<: *airflow-common
command: webserver
ports:
- "8080:8080"
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-scheduler:
<<: *airflow-common
command: scheduler
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-master-worker:
<<: *airflow-common
command: airflow celery worker -q main,default
healthcheck:
# yamllint disable rule:line-length
test:
- "CMD-SHELL"
- 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-master@$$(hostname)"'
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
environment:
<<: *airflow-common-env
# Required to handle warm shutdown of the celery workers properly
# See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
DUMB_INIT_SETSID: 0
AIRFLOW__CELERY__WORKER_QUEUES: "main,default"
AIRFLOW__CELERY__WORKER_TAGS: "master"
AIRFLOW__CELERY__WORKER_CONCURRENCY: "16"
AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
AIRFLOW__CELERY__WORKER_NAME: "worker-master@%h"
AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
# Max memory per child process before it's recycled. Helps prevent memory leaks.
# 256MB is sufficient for master worker tasks. DL workers use a higher limit.
AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "262144" # 256MB
hostname: ${HOSTNAME}
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-triggerer:
<<: *airflow-common
command: triggerer
healthcheck:
test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-init:
<<: *airflow-common
depends_on:
<<: *airflow-common-depends-on
minio-init:
condition: service_completed_successfully
redis-proxy-account-clear:
condition: service_completed_successfully
entrypoint: /bin/bash
# yamllint disable rule:line-length
command:
- -c
- |
# This container runs as root and is responsible for initializing the environment.
# It sets permissions on mounted directories to ensure the 'airflow' user (running with AIRFLOW_UID)
# can write to them. This is crucial for logs, dags, and plugins.
echo "Initializing permissions for Airflow directories..."
chown -R "${AIRFLOW_UID}:${AIRFLOW_GID}" /opt/airflow/dags /opt/airflow/logs /opt/airflow/plugins /opt/airflow/config /opt/airflow/downloadfiles /opt/airflow/addfiles /opt/airflow/inputfiles
echo "Permissions set."
if [[ -z "${AIRFLOW_UID}" ]]; then
echo
echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
echo "If you are on Linux, you SHOULD follow the instructions below to set "
echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
echo "For other operating systems you can get rid of the warning with manually created .env file:"
echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
echo
fi
# This container's job is to initialize the database, create a user, and import connections.
# Wait for db to be ready.
airflow db check --retry 30 --retry-delay 5
# Run database migrations.
echo "Running database migrations..."
airflow db upgrade
echo "Database migrations complete."
# Create the admin user if it doesn't exist.
# The '|| true' prevents the script from failing if the user already exists.
echo "Checking for and creating admin user..."
airflow users create \
--username "admin" \
--password "${AIRFLOW_ADMIN_PASSWORD:-admin_pwd_X9yZ3aB1cE5dF7gH}" \
--firstname Admin \
--lastname User \
--role Admin \
--email admin@example.com || true
echo "Admin user check/creation complete."
# Import connections from any .json file in the config directory.
echo "Searching for connection files in /opt/airflow/config..."
if [ -d "/opt/airflow/config" ] && [ -n "$(ls -A /opt/airflow/config/*.json 2>/dev/null)" ]; then
for conn_file in /opt/airflow/config/*.json; do
if [ -f "$$conn_file" ]; then
# Exclude files that are not meant to be Airflow connections.
if [ "$(basename "$$conn_file")" = "camoufox_endpoints.json" ]; then
echo "Skipping '$$conn_file' as it is not an Airflow connection file."
continue
fi
echo "Importing connections from $$conn_file"
airflow connections import "$$conn_file" || echo "Failed to import $$conn_file, but continuing."
fi
done
else
echo "No connection files found to import, or /opt/airflow/config is empty/missing."
fi
echo "Connection import process complete."
# yamllint enable rule:line-length
environment:
<<: *airflow-common-env
_AIRFLOW_DB_MIGRATE: 'true'
_AIRFLOW_WWW_USER_CREATE: 'false' # Set to false as we handle it manually
_PIP_ADDITIONAL_REQUIREMENTS: ''
user: "0:0"
airflow-cli:
<<: *airflow-common
profiles:
- debug
environment:
<<: *airflow-common-env
CONNECTION_CHECK_MAX_COUNT: "0"
# Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
command:
- bash
- -c
- airflow
# You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up
# or by explicitly targeted on the command line e.g. docker-compose up flower.
# See: https://docs.docker.com/compose/profiles/
flower:
<<: *airflow-common
command: celery flower
ports:
- "5555:5555"
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
docker-socket-proxy:
profiles:
- disabled
image: tecnativa/docker-socket-proxy:0.1.1
networks:
- proxynet
environment:
CONTAINERS: 1
IMAGES: 1
AUTH: 1
POST: 1
privileged: true
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
restart: always
volumes:
postgres-db-volume:
networks:
proxynet:
name: airflow_proxynet
external: true


@ -0,0 +1,546 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
#
# WARNING: This configuration is for local development. Do not use it in a production deployment.
#
# This configuration supports basic configuration using environment variables or an .env file
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
# Default: apache/airflow:2.10.5
# AIRFLOW_UID - User ID in Airflow containers
# Default: 50000
# AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed.
# Default: .
# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
#
# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested).
# Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested).
# Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
# Use this option ONLY for quick checks. Installing requirements at container
# startup is done EVERY TIME the service is started.
# A better way is to build a custom image or extend the official image
# as described in https://airflow.apache.org/docs/docker-stack/build.html.
# Default: ''
#
# Feel free to modify this file to suit your needs.
---
name: airflow-master
x-minio-common: &minio-common
image: quay.io/minio/minio:RELEASE.2025-07-23T15-54-02Z
command: server --console-address ":9001" http://minio{1...3}/data{1...2}
expose:
- "9000"
- "9001"
networks:
- proxynet
env_file:
- .env
environment:
MINIO_ROOT_USER: ${{ '{' }}MINIO_ROOT_USER:-admin{{ '}' }}
MINIO_ROOT_PASSWORD: ${{ '{' }}MINIO_ROOT_PASSWORD:-0153093693-0009{{ '}' }}
healthcheck:
test: ["CMD", "mc", "ready", "local"]
interval: 5s
timeout: 5s
retries: 5
restart: always
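# Note: the Jinja2 brace escapes used throughout this template (expressions emitting literal '{' and '}')
# exist so that the rendered compose file contains plain ${VAR:-default} expressions; variable resolution
# is therefore deferred to docker-compose instead of happening at template render time.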
x-airflow-common:
&airflow-common
# In order to add custom dependencies or upgrade provider packages you can use your extended image.
# This will build the image from the Dockerfile in this directory and tag it.
image: ${{ '{' }}AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest{{ '}' }}
build: .
# Add extra hosts here to allow the master services (webserver, scheduler) to resolve
# the hostnames of your remote DL workers. This is crucial for fetching logs.
# Format: - "hostname:ip_address"
# IMPORTANT: This section is auto-generated from cluster.yml
extra_hosts:
{% for host_name, host_ip in all_hosts.items() %}
- "{{ host_name }}:{{ host_ip }}"
{% endfor %}
env_file:
- .env
networks:
- proxynet
environment:
&airflow-common-env
AIRFLOW__CORE__PARALLELISM: 64
AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG: 32
AIRFLOW__SCHEDULER__PARSING_PROCESSES: 4
AIRFLOW__CORE__EXECUTOR: CeleryExecutor
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${{ '{' }}POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP{{ '}' }}@postgres/airflow
AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:${{ '{' }}POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP{{ '}' }}@postgres/airflow
AIRFLOW__CELERY__BROKER_URL: redis://:${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }}@redis:6379/0
AIRFLOW__CORE__FERNET_KEY: ''
AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
AIRFLOW__CORE__INTERNAL_API_SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
# yamllint disable rule:line-length
# Use simple http server on scheduler for health checks
# See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
# yamllint enable rule:line-length
AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
# WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for quick checks
# for other purposes (development, test and especially production usage) build/extend Airflow image.
#_PIP_ADDITIONAL_REQUIREMENTS: ${{ '{' }}_PIP_ADDITIONAL_REQUIREMENTS:- apache-airflow-providers-docker apache-airflow-providers-http thrift>=0.16.0,<=0.20.0 backoff>=2.2.1 python-dotenv==1.0.1 psutil>=5.9.0{{ '}' }}
# The following line can be used to set a custom config file, stored in the local config folder
# If you want to use it, outcomment it and replace airflow.cfg with the name of your config file
AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
{% raw %}
AIRFLOW__LOGGING__REMOTE_LOG_FORMAT: "[%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s"
AIRFLOW__LOGGING__LOG_LEVEL: "INFO"
AIRFLOW__LOGGING__LOG_FILENAME_TEMPLATE: "{{ ti.dag_id }}/{{ ti.run_id }}/{{ ti.task_id }}/attempt={{ try_number }}.log"
{% endraw %}
AIRFLOW__CORE__LOCAL_SETTINGS_PATH: "/opt/airflow/config/custom_task_hooks.py"
volumes:
- ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/dags:/opt/airflow/dags
- ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/logs:/opt/airflow/logs
- ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/config:/opt/airflow/config
- ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/plugins:/opt/airflow/plugins
- ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/downloadfiles:/opt/airflow/downloadfiles
- ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/addfiles:/opt/airflow/addfiles
- ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/inputfiles:/opt/airflow/inputfiles
user: "${{ '{' }}AIRFLOW_UID:-50000{{ '}' }}:${{ '{' }}AIRFLOW_GID:-0{{ '}' }}"
depends_on:
&airflow-common-depends-on
redis:
condition: service_healthy
postgres:
condition: service_healthy
nginx-minio-lb:
condition: service_healthy
services:
postgres:
image: postgres:13
env_file:
- .env
networks:
- proxynet
environment:
POSTGRES_USER: airflow
POSTGRES_PASSWORD: ${{ '{' }}POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP{{ '}' }}
POSTGRES_DB: airflow
volumes:
- postgres-db-volume:/var/lib/postgresql/data
ports:
- "{{ postgres_port }}:5432"
healthcheck:
test: ["CMD", "pg_isready", "-U", "airflow"]
interval: 10s
retries: 5
start_period: 5s
restart: always
redis:
# Redis is limited to 7.2-bookworm due to licensing change
# https://redis.io/blog/redis-adopts-dual-source-available-licensing/
image: redis:7.2-bookworm
env_file:
- .env
networks:
- proxynet
command:
- "redis-server"
- "--requirepass"
- "${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }}"
- "--bind"
- "*"
- "--protected-mode"
- "no"
- "--save"
- "60"
- "1"
- "--loglevel"
- "warning"
- "--appendonly"
- "yes"
volumes:
- ./redis-data:/data
expose:
- 6379
ports:
- "{{ redis_port }}:6379"
healthcheck:
test: ["CMD", "redis-cli", "-a", "${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }}", "ping"]
interval: 10s
timeout: 30s
retries: 50
start_period: 30s
restart: always
redis-proxy-account-clear:
image: redis:7.2-bookworm
container_name: redis-proxy-account-clear
env_file:
- .env
networks:
- proxynet
command: >
sh -c "
echo 'Clearing proxy and account statuses from Redis...';
redis-cli -h redis -a $${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }} --scan --pattern 'proxy_status:*' | xargs -r redis-cli -h redis -a $${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }} DEL;
redis-cli -h redis -a $${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }} --scan --pattern 'account_status:*' | xargs -r redis-cli -h redis -a $${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }} DEL;
echo 'Redis cleanup complete.'
"
depends_on:
redis:
condition: service_healthy
minio1:
<<: *minio-common
hostname: minio1
volumes:
- ./minio-data/1/1:/data1
- ./minio-data/1/2:/data2
minio2:
<<: *minio-common
hostname: minio2
volumes:
- ./minio-data/2/1:/data1
- ./minio-data/2/2:/data2
depends_on:
minio1:
condition: service_started
minio3:
<<: *minio-common
hostname: minio3
volumes:
- ./minio-data/3/1:/data1
- ./minio-data/3/2:/data2
depends_on:
minio2:
condition: service_started
nginx-minio-lb:
image: nginx:1.19.2-alpine
hostname: nginx-minio-lb
networks:
- proxynet
command: sh -c "apk add --no-cache curl >/dev/null 2>&1 && exec nginx -g 'daemon off;'"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
ports:
- "9000:9000"
- "9001:9001"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9001/minio/health/live"]
interval: 10s
timeout: 5s
retries: 5
start_period: 10s
depends_on:
minio1:
condition: service_healthy
minio2:
condition: service_healthy
minio3:
condition: service_healthy
restart: always
minio-init:
image: minio/mc
container_name: minio-init
networks:
- proxynet
depends_on:
nginx-minio-lb:
condition: service_healthy
entrypoint: >
/bin/sh -c "
set -e;
/usr/bin/mc alias set minio http://nginx-minio-lb:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD;
# Retry loop for bucket creation
MAX_ATTEMPTS=10
SUCCESS=false
# Use a for loop for robustness, as it's generally more portable than `until`.
for i in $$(seq 1 $$MAX_ATTEMPTS); do
# Check if the bucket exists. If so, we're done.
if /usr/bin/mc ls minio/airflow-logs > /dev/null 2>&1; then
echo 'MinIO bucket already exists.'
SUCCESS=true
break
fi
# If not, try to create it. If successful, we're done.
# We redirect output because `mc mb` can error if another process creates it in the meantime.
if /usr/bin/mc mb minio/airflow-logs > /dev/null 2>&1; then
echo 'MinIO bucket created.'
SUCCESS=true
break
fi
# If we reach here, both checks failed. Wait and retry.
echo "Attempt $$i/$$MAX_ATTEMPTS: Waiting for MinIO bucket..."
sleep 2
done
# After the loop, check if we succeeded.
if [ "$$SUCCESS" = "false" ]; then
echo "Failed to create MinIO bucket after $$MAX_ATTEMPTS attempts."
exit 1
fi
/usr/bin/mc anonymous set download minio/airflow-logs;
echo 'MinIO initialized: bucket airflow-logs created and policy set to download.';
"
env_file:
- .env
environment:
MINIO_ROOT_USER: ${{ '{' }}MINIO_ROOT_USER:-admin{{ '}' }}
MINIO_ROOT_PASSWORD: ${{ '{' }}MINIO_ROOT_PASSWORD:-0153093693-0009{{ '}' }}
restart: on-failure
nginx-healthcheck:
image: nginx:alpine
container_name: nginx-healthcheck
networks:
- proxynet
ports:
- "8888:80"
restart: always
airflow-webserver:
<<: *airflow-common
command: webserver
ports:
- "8080:8080"
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-scheduler:
<<: *airflow-common
command: scheduler
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-master-worker:
<<: *airflow-common
command: airflow celery worker -q main,default
healthcheck:
# yamllint disable rule:line-length
test:
- "CMD-SHELL"
- 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-master@$$(hostname)"'
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
environment:
<<: *airflow-common-env
# Required to handle warm shutdown of the celery workers properly
# See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
DUMB_INIT_SETSID: 0
AIRFLOW__CELERY__WORKER_QUEUES: "main,default"
AIRFLOW__CELERY__WORKER_TAGS: "master"
AIRFLOW__CELERY__WORKER_CONCURRENCY: "16"
AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
AIRFLOW__CELERY__WORKER_NAME: "worker-master@%h"
AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
# Max memory per child process before it's recycled. Helps prevent memory leaks.
# 256MB is sufficient for master worker tasks. DL workers use a higher limit.
AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "262144" # 256MB
hostname: ${{ '{' }}HOSTNAME{{ '}' }}
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-triggerer:
<<: *airflow-common
command: triggerer
healthcheck:
test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${{ '{' }}HOSTNAME{{ '}' }}"']
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-init:
<<: *airflow-common
depends_on:
<<: *airflow-common-depends-on
minio-init:
condition: service_completed_successfully
redis-proxy-account-clear:
condition: service_completed_successfully
entrypoint: /bin/bash
# yamllint disable rule:line-length
command:
- -c
- |
# This container runs as root and is responsible for initializing the environment.
# It sets permissions on mounted directories to ensure the 'airflow' user (running with AIRFLOW_UID)
# can write to them. This is crucial for logs, dags, and plugins.
echo "Initializing permissions for Airflow directories..."
chown -R "${{ '{' }}AIRFLOW_UID{{ '}' }}:${{ '{' }}AIRFLOW_GID{{ '}' }}" /opt/airflow/dags /opt/airflow/logs /opt/airflow/plugins /opt/airflow/config /opt/airflow/downloadfiles /opt/airflow/addfiles /opt/airflow/inputfiles
echo "Permissions set."
if [[ -z "${{ '{' }}AIRFLOW_UID{{ '}' }}" ]]; then
echo
echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
echo "If you are on Linux, you SHOULD follow the instructions below to set "
echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
echo "For other operating systems you can get rid of the warning with manually created .env file:"
echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
echo
fi
# This container's job is to initialize the database, create a user, and import connections.
# Wait for db to be ready.
airflow db check --retry 30 --retry-delay 5
# Run database migrations.
echo "Running database migrations..."
airflow db upgrade
echo "Database migrations complete."
# Create the admin user if it doesn't exist.
# The '|| true' prevents the script from failing if the user already exists.
echo "Checking for and creating admin user..."
airflow users create \
--username "admin" \
--password "${{ '{' }}AIRFLOW_ADMIN_PASSWORD:-admin_pwd_X9yZ3aB1cE5dF7gH{{ '}' }}" \
--firstname Admin \
--lastname User \
--role Admin \
--email admin@example.com || true
echo "Admin user check/creation complete."
# Import connections from any .json file in the config directory.
echo "Searching for connection files in /opt/airflow/config..."
if [ -d "/opt/airflow/config" ] && [ -n "$(ls -A /opt/airflow/config/*.json 2>/dev/null)" ]; then
for conn_file in /opt/airflow/config/*.json; do
if [ -f "$$conn_file" ]; then
# Exclude files that are not meant to be Airflow connections.
if [ "$(basename "$$conn_file")" = "camoufox_endpoints.json" ]; then
echo "Skipping '$$conn_file' as it is not an Airflow connection file."
continue
fi
echo "Importing connections from $$conn_file"
airflow connections import "$$conn_file" || echo "Failed to import $$conn_file, but continuing."
fi
done
else
echo "No connection files found to import, or /opt/airflow/config is empty/missing."
fi
echo "Connection import process complete."
# yamllint enable rule:line-length
environment:
<<: *airflow-common-env
_AIRFLOW_DB_MIGRATE: 'true'
_AIRFLOW_WWW_USER_CREATE: 'false' # Set to false as we handle it manually
_PIP_ADDITIONAL_REQUIREMENTS: ''
user: "0:0"
airflow-cli:
<<: *airflow-common
profiles:
- debug
environment:
<<: *airflow-common-env
CONNECTION_CHECK_MAX_COUNT: "0"
# Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
command:
- bash
- -c
- airflow
# You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up
# or by explicitly targeted on the command line e.g. docker-compose up flower.
# See: https://docs.docker.com/compose/profiles/
flower:
<<: *airflow-common
command: celery flower
ports:
- "5555:5555"
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
docker-socket-proxy:
profiles:
- disabled
image: tecnativa/docker-socket-proxy:0.1.1
networks:
- proxynet
environment:
CONTAINERS: 1
IMAGES: 1
AUTH: 1
POST: 1
privileged: true
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
restart: always
volumes:
postgres-db-volume:
networks:
proxynet:
name: airflow_proxynet
external: true


@ -0,0 +1,101 @@
name: ytdlp-ops
{% if service_role != 'management' %}
include:
# This automatically includes the generated camoufox service definitions and dependencies.
# It simplifies the docker-compose command, as you no longer need to specify both files with -f.
# The file is generated by the config-generator service and will be created even if empty.
- docker-compose.camoufox.yaml
{% endif %}
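# For example, on worker/all-in-one hosts a plain "docker compose up -d" starts both this stack and the
# camoufox services, with no extra -f flags needed.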
services:
envoy:
image: envoyproxy/envoy:v1.29-latest
container_name: envoy-thrift-lb
restart: unless-stopped
volumes:
# Mount the generated config file from the host
- ./envoy.yaml:/etc/envoy/envoy.yaml:ro
ports:
# This is the single public port for all Thrift traffic
- "${ENVOY_PORT:-9080}:${ENVOY_PORT:-9080}"
# Expose the admin port for debugging
- "${ENVOY_ADMIN_PORT:-9901}:${ENVOY_ADMIN_PORT:-9901}"
networks:
- proxynet
# This service depends on ytdlp-ops-service, which in turn waits for camoufox.
depends_on:
- ytdlp-ops-service
ytdlp-ops-service:
image: pangramia/ytdlp-ops-server:latest # Don't comment out or remove, build is performed externally
# container_name is omitted; Docker will use the service name for DNS.
# This service depends on the camoufox-group service, which ensures all camoufox
# instances are started before this service.
{% if service_role != 'management' %}
depends_on:
- camoufox-group
{% endif %}
# Ports are no longer exposed directly. Envoy will connect to them on the internal network.
env_file:
- ./.env # Path is relative to the compose file
volumes:
- context-data:/app/context-data
# Mount the generated endpoints file to make it available to the server
- ./camoufox/camoufox_endpoints.json:/app/config/camoufox_endpoints.json:ro
# Mount the plugin source code for live updates without rebuilding the image.
# Assumes the plugin source is in a 'bgutil-ytdlp-pot-provider' directory
# next to your docker-compose.yaml file.
#- ./bgutil-ytdlp-pot-provider:/app/bgutil-ytdlp-pot-provider
networks:
- proxynet
command:
# --- Parameters for ALL service roles ---
- "--port"
- "${YTDLP_BASE_PORT:-9090}"
- "--timeout"
- "${YTDLP_TIMEOUT:-600}"
- "--workers"
- "${YTDLP_WORKERS:-3}"
- "--verbose"
- "--server-identity"
- "${SERVER_IDENTITY:-ytdlp-ops-airflow-service}"
- "--redis-host"
- "${REDIS_HOST:-redis}"
- "--redis-port"
- "${REDIS_PORT:-6379}"
- "--redis-password"
- "${REDIS_PASSWORD}"
- "--account-active-duration-min"
- "${ACCOUNT_ACTIVE_DURATION_MIN:-30}"
- "--account-cooldown-duration-min"
- "${ACCOUNT_COOLDOWN_DURATION_MIN:-60}"
- "--service-role"
- "{{ service_role }}"
{% if service_role != 'management' %}
# --- Parameters for worker/all-in-one roles ONLY ---
- "--script-dir"
- "/app"
- "--context-dir"
- "/app/context-data"
- "--clean-context-dir"
- "--clients"
- "${YT_CLIENTS:-web,mweb,ios,android}"
- "--proxies"
- "{{ combined_proxies_str }}"
- "--camoufox-endpoints-file"
- "/app/config/camoufox_endpoints.json"
- "--print-tokens"
- "--stop-if-no-proxy"
{% endif %}
restart: unless-stopped
pull_policy: always
volumes:
context-data:
name: context-data
networks:
proxynet:
name: airflow_proxynet
external: true


@ -0,0 +1,47 @@
# THIS FILE IS AUTO-GENERATED BY generate_envoy_config.py
# DO NOT EDIT MANUALLY.
#
# It contains the service definitions for the camoufox instances
# and adds the necessary dependencies to the main services.
services:
camoufox-1:
image: ghcr.io/safing/camoufox:latest
container_name: ytdlp-ops-camoufox-1-1
restart: unless-stopped
ports:
- "12345:12345"
environment:
- DISPLAY=:99
- CAMOUFOX_MAX_MEMORY_MB=2048
- CAMOUFOX_MAX_CONCURRENT_CONTEXTS=8
- CAMOUFOX_RESTART_THRESHOLD_MB=1500
volumes:
- /tmp/.X11-unix:/tmp/.X11-unix:rw
- camoufox-data-1:/app/context-data
command: [
"--ws-host", "0.0.0.0",
"--port", "12345",
"--ws-path", "mypath",
"--headless",
"--monitor-resources",
"--memory-restart-threshold", "1800"
]
deploy:
resources:
limits:
memory: 2.5G
logging:
driver: "json-file"
options:
max-size: "100m"
max-file: "3"
networks:
- camoufox-network
volumes:
camoufox-data-1:
networks:
camoufox-network:
driver: bridge

View File

@ -0,0 +1,57 @@
# THIS FILE IS AUTO-GENERATED BY generate_envoy_config.py
# DO NOT EDIT MANUALLY.
#
# It contains the service definitions for the camoufox instances
# and adds the necessary dependencies to the main services.
services:
{% for proxy in proxies %}
camoufox-{{ loop.index }}:
build:
context: ./camoufox
dockerfile: Dockerfile
args:
VNC_PASSWORD: "{{ vnc_password }}"
shm_size: 2gb # Increase shared memory for browser stability
volumes:
- camoufox-data-{{ loop.index }}:/app/persistent-data
ports:
- "{{ base_vnc_port + loop.index - 1 }}:5900"
networks:
- proxynet
command: [
"--ws-host", "0.0.0.0",
"--port", "12345",
"--ws-path", "mypath",
"--proxy-url", "{{ proxy.url }}",
"--locale", "en-US",
"--geoip",
"--extensions", "/app/extensions/google_sign_in_popup_blocker-1.0.2.xpi,/app/extensions/spoof_timezone-0.3.4.xpi,/app/extensions/youtube_ad_auto_skipper-0.6.0.xpi",
"--persistent-context",
"--user-data-dir", "/app/persistent-data",
"--preferences", "security.sandbox.content.level=0,layers.acceleration.disabled=true,cookiebanners.service.mode=2,cookiebanners.service.mode.privateBrowsing=2,network.cookie.lifetimePolicy=0,network.cookie.thirdparty.sessionOnly=false,network.cookie.cookieBehavior=0,network.cookie.alwaysAcceptSessionCookies=true",
"--num-instances", "{{ num_instances | default(4) }}",
"--monitor-resources"
]
restart: unless-stopped
{% endfor %}
{% if proxies %}
# This service is a dependency anchor. The main services depend on it,
# and it in turn depends on all camoufox instances.
camoufox-group:
image: alpine:3.19
command: ["echo", "Camoufox dependency group ready."]
restart: "no"
networks:
- proxynet
depends_on:
{% for proxy in proxies %}
camoufox-{{ loop.index }}:
condition: service_started
{% endfor %}
{% endif %}
volumes:
{% for proxy in proxies %}
camoufox-data-{{ loop.index }}:
{% endfor %}

View File

@ -0,0 +1,14 @@
# This file is used to generate the necessary configuration files for the main application stack.
# It should be run as a one-off command before starting the main services.
# Example: docker-compose -f airflow/docker-compose.config-generate.yaml run --rm config-generator
services:
config-generator:
image: python:3.12-slim
working_dir: /app
env_file:
- ./.env
volumes:
# Mount the entire airflow directory to access scripts and write output files
- ./:/app
command: >
sh -c "pip install jinja2 && python3 generate_envoy_config.py"

54
airflow/envoy.yaml.j2 Normal file
View File

@ -0,0 +1,54 @@
# Jinja2 template for Envoy configuration
admin:
address:
socket_address:
address: 0.0.0.0
port_value: {{ envoy_admin_port }}
static_resources:
listeners:
# Listener for ytdlp-ops Thrift traffic
- name: ytdlp_ops_listener
address:
socket_address:
address: 0.0.0.0
port_value: {{ envoy_port }}
filter_chains:
- filters:
- name: envoy.filters.network.thrift_proxy
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.network.thrift_proxy.v3.ThriftProxy
stat_prefix: thrift_ingress
transport: FRAMED
protocol: BINARY
route_config:
name: local_route
routes:
- match:
method_name: ""
route:
cluster: ytdlp_ops_cluster
clusters:
# Cluster for the ytdlp-ops workers
- name: ytdlp_ops_cluster
connect_timeout: 5s
type: {{ envoy_cluster_type }}
lb_policy: ROUND_ROBIN
health_checks:
- timeout: 1s
interval: 5s
unhealthy_threshold: 3
healthy_threshold: 2
tcp_health_check: {}
load_assignment:
cluster_name: ytdlp_ops_cluster
endpoints:
- lb_endpoints:
{% for i in range(worker_count) %}
- endpoint:
address:
socket_address:
address: {{ backend_address }}
port_value: {{ base_port + i }}
{% endfor %}
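# Illustrative rendering (assuming worker_count=3, base_port=9090 and
# backend_address=ytdlp-ops-service): the loop above emits three lb_endpoints
# pointing at ytdlp-ops-service:9090, :9091 and :9092.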

View File

@ -0,0 +1,340 @@
#!/usr/bin/env python3
import os
import sys
import json
import re
try:
from jinja2 import Environment, FileSystemLoader
except ImportError:
print("FATAL: jinja2 is not installed. Please run 'pip install jinja2'.", file=sys.stderr)
sys.exit(1)
import logging
import ipaddress
from typing import Optional
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def is_ip_address(address: str) -> bool:
"""Checks if a given string is a valid IP address (IPv4 or IPv6)."""
if not address:
return False
try:
ipaddress.ip_address(address)
return True
except ValueError:
return False
def load_dotenv(dotenv_path):
"""
Loads environment variables from a .env file.
Does not override existing environment variables from the system.
"""
if not os.path.exists(dotenv_path):
logging.warning(f".env file not found at {dotenv_path}. Using system environment variables or defaults.")
return
try:
with open(dotenv_path) as f:
for line in f:
line = line.strip()
if line and not line.startswith('#') and '=' in line:
key, value = line.split('=', 1)
key = key.strip()
value = value.strip()
# Remove surrounding quotes which are common in .env files
if (value.startswith('"') and value.endswith('"')) or \
(value.startswith("'") and value.endswith("'")):
value = value[1:-1]
# os.environ only takes strings
value = str(value)
if key not in os.environ:
os.environ[key] = value
logging.info(f"Successfully loaded variables from {dotenv_path}")
except Exception as e:
logging.error(f"Failed to read or parse {dotenv_path}: {e}")
# Continue, will use defaults or system env vars
def _get_port_from_proxy_url(url: str) -> Optional[str]:
"""Extracts the port from a proxy URL string."""
if not url or not isinstance(url, str):
return None
match = re.search(r':(\d+)$', url.strip())
return match.group(1) if match else None
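# Illustrative example: _get_port_from_proxy_url("socks5://10.0.0.1:1080") returns "1080";
# a URL without a trailing ":<port>" returns None.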
def expand_env_vars(value: str) -> str:
"""
Expands environment variables in a string, including default values.
Supports ${VAR} and ${VAR:-default}.
"""
if not isinstance(value, str):
return value
# Regex to find ${VAR:-default} or ${VAR}
pattern = re.compile(r'\$\{(?P<var>\w+)(?::-(?P<default>.*?))?\}')
def replacer(match):
var_name = match.group('var')
default_value = match.group('default')
# Get value from os.environ, or use default, or empty string
return os.getenv(var_name, default_value if default_value is not None else '')
return pattern.sub(replacer, value)
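# Illustrative example (hypothetical variables): with PROXY_HOST unset,
#   expand_env_vars("socks5://${PROXY_HOST:-10.0.0.1}:${PROXY_PORT:-1080}")
# returns "socks5://10.0.0.1:1080"; if PROXY_HOST is set, its value overrides the inline default.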
def generate_configs():
"""
Generates envoy.yaml, docker-compose.camoufox.yaml, and camoufox_endpoints.json
from Jinja2 templates and environment variables.
"""
try:
# --- Load .env file ---
script_dir = os.path.dirname(os.path.abspath(__file__))
dotenv_path = os.path.join(script_dir, '.env')
load_dotenv(dotenv_path)
# --- Common Configuration ---
ytdlp_workers_str = os.getenv('YTDLP_WORKERS', '3').strip()
try:
# Handle empty string case by defaulting to 3, otherwise convert to int.
worker_count = int(ytdlp_workers_str) if ytdlp_workers_str else 3
except (ValueError, TypeError):
logging.warning(f"Invalid value for YTDLP_WORKERS: '{ytdlp_workers_str}'. Defaulting to 3.")
worker_count = 3
if worker_count == 0:
worker_count = os.cpu_count() or 1
logging.info(f"YTDLP_WORKERS is 0, auto-detected {worker_count} CPU cores for worker and camoufox config.")
config_dir = os.path.join(script_dir, 'config')
os.makedirs(config_dir, exist_ok=True)
env = Environment(loader=FileSystemLoader(script_dir), trim_blocks=True, lstrip_blocks=True)
# Get service role from environment to determine what to generate
service_role = os.getenv('SERVICE_ROLE', 'all-in-one')
logging.info(f"Service role for generation: '{service_role}'")
# --- Camoufox Configuration (only for worker/all-in-one roles) ---
camoufox_proxies = []
expanded_camoufox_proxies_str = ""
if service_role != 'management':
logging.info("--- Generating Camoufox (Remote Browser) Configuration ---")
camoufox_proxies_str = os.getenv('CAMOUFOX_PROXIES')
if not camoufox_proxies_str:
logging.warning("CAMOUFOX_PROXIES environment variable not set. No camoufox instances will be generated.")
else:
# Expand environment variables within the string before splitting
expanded_camoufox_proxies_str = expand_env_vars(camoufox_proxies_str)
logging.info(f"Expanded CAMOUFOX_PROXIES from '{camoufox_proxies_str}' to '{expanded_camoufox_proxies_str}'")
camoufox_proxies = [{'url': p.strip()} for p in expanded_camoufox_proxies_str.split(',') if p.strip()]
logging.info(f"Found {len(camoufox_proxies)} proxy/proxies for Camoufox.")
logging.info(f"Each Camoufox instance will support {worker_count} concurrent browser sessions.")
logging.info(f"Total browser sessions supported on this worker: {len(camoufox_proxies) * worker_count}")
vnc_password = os.getenv('VNC_PASSWORD', 'supersecret')
base_vnc_port = int(os.getenv('CAMOUFOX_BASE_VNC_PORT', 5901))
camoufox_port = int(os.getenv('CAMOUFOX_PORT', 12345))
camoufox_backend_prefix = os.getenv('CAMOUFOX_BACKEND_PREFIX', 'camoufox-')
# --- Generate docker-compose.camoufox.yaml ---
compose_output_file = os.path.join(script_dir, 'docker-compose.camoufox.yaml')
# Generate the compose file directly without template
with open(compose_output_file, 'w') as f:
f.write("# THIS FILE IS AUTO-GENERATED BY generate_envoy_config.py\n")
f.write("# DO NOT EDIT MANUALLY.\n")
f.write("#\n")
f.write("# It contains the service definitions for the camoufox instances\n")
f.write("# and adds the necessary dependencies to the main services.\n")
f.write("services:\n\n")
# Generate services for each proxy
for i, proxy in enumerate(camoufox_proxies):
service_name = f"camoufox-{i+1}"
# Each container gets its own unique range of ports to avoid conflicts
container_base_port = camoufox_port + i * worker_count
host_base_port = container_base_port
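# Illustrative example: with CAMOUFOX_PORT=12345 and worker_count=3,
# camoufox-1 gets ports 12345-12347, camoufox-2 gets 12348-12350, and so on.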
f.write(f" {service_name}:\n")
f.write(f" build:\n")
f.write(f" context: ./camoufox\n")
f.write(f" dockerfile: Dockerfile\n")
f.write(f" args:\n")
f.write(f" VNC_PASSWORD: {vnc_password}\n")
f.write(f" image: camoufox:latest\n")
f.write(f" container_name: ytdlp-ops-{service_name}-1\n")
f.write(f" restart: unless-stopped\n")
f.write(f" shm_size: '2gb' # Mitigates browser crashes due to shared memory limitations\n")
f.write(f" ports:\n")
f.write(f" - \"{host_base_port}-{host_base_port + worker_count - 1}:{container_base_port}-{container_base_port + worker_count - 1}\"\n")
f.write(f" environment:\n")
f.write(f" - DISPLAY=:99\n")
f.write(f" - MOZ_HEADLESS_STACKSIZE=2097152\n")
f.write(f" - CAMOUFOX_MAX_MEMORY_MB=2048\n")
f.write(f" - CAMOUFOX_MAX_CONCURRENT_CONTEXTS=8\n")
f.write(f" - CAMOUFOX_RESTART_THRESHOLD_MB=1500\n")
f.write(f" volumes:\n")
f.write(f" - /tmp/.X11-unix:/tmp/.X11-unix:rw\n")
f.write(f" - camoufox-data-{i+1}:/app/context-data\n")
f.write(f" - camoufox-browser-cache:/root/.cache/ms-playwright # Persist browser binaries\n")
f.write(f" command: [\n")
f.write(f" \"--ws-host\", \"0.0.0.0\",\n")
f.write(f" \"--port\", \"{container_base_port}\",\n")
f.write(f" \"--num-instances\", \"{worker_count}\",\n")
f.write(f" \"--ws-path\", \"mypath\",\n")
f.write(f" \"--proxy-url\", \"{proxy['url']}\",\n")
f.write(f" \"--headless\",\n")
f.write(f" \"--monitor-resources\",\n")
f.write(f" \"--memory-restart-threshold\", \"1800\",\n")
f.write(f" \"--preferences\", \"layers.acceleration.disabled=true,dom.ipc.processCount=2,media.memory_cache_max_size=102400,browser.cache.memory.capacity=102400\"\n")
f.write(f" ]\n")
f.write(f" deploy:\n")
f.write(f" resources:\n")
f.write(f" limits:\n")
f.write(f" memory: 2.5G\n")
f.write(f" logging:\n")
f.write(f" driver: \"json-file\"\n")
f.write(f" options:\n")
f.write(f" max-size: \"100m\"\n")
f.write(f" max-file: \"3\"\n")
f.write(f" networks:\n")
f.write(f" - proxynet\n\n")
# Add camoufox-group service that depends on all camoufox instances
if camoufox_proxies:
f.write(" camoufox-group:\n")
f.write(" image: alpine:latest\n")
f.write(" command: [\"echo\", \"Camoufox group ready.\"]\n")
f.write(" restart: \"no\"\n")
f.write(" depends_on:\n")
for i in range(len(camoufox_proxies)):
f.write(f" - camoufox-{i+1}\n")
f.write(" networks:\n")
f.write(" - proxynet\n\n")
# Write volumes section
f.write("volumes:\n")
for i in range(len(camoufox_proxies)):
f.write(f" camoufox-data-{i+1}:\n")
if camoufox_proxies:
f.write(" camoufox-browser-cache:\n")
f.write("\n")
# Write networks section
f.write("networks:\n")
f.write(" proxynet:\n")
f.write(" name: airflow_proxynet\n")
f.write(" external: true\n")
logging.info(f"Successfully generated {compose_output_file} with {len(camoufox_proxies)} camoufox service(s).")
logging.info("This docker-compose file defines the remote browser services, one for each proxy.")
logging.info("----------------------------------------------------------")
# --- Generate camoufox_endpoints.json ---
endpoints_map = {}
for i, proxy in enumerate(camoufox_proxies):
proxy_port = _get_port_from_proxy_url(proxy['url'])
if proxy_port:
container_base_port = camoufox_port + i * worker_count
endpoints = []
for j in range(worker_count):
port = container_base_port + j
endpoints.append(f"ws://{camoufox_backend_prefix}{i+1}:{port}/mypath")
endpoints_map[proxy_port] = {
"ws_endpoints": endpoints
}
else:
logging.warning(f"Could not extract port from proxy URL: {proxy['url']}. Skipping for endpoint map.")
endpoints_data = {"endpoints": endpoints_map}
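# Illustrative shape (hypothetical proxy "socks5://10.0.0.1:1080", worker_count=2):
#   {"endpoints": {"1080": {"ws_endpoints": ["ws://camoufox-1:12345/mypath",
#                                            "ws://camoufox-1:12346/mypath"]}}}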
camoufox_dir = os.path.join(script_dir, 'camoufox')
endpoints_output_file = os.path.join(camoufox_dir, 'camoufox_endpoints.json')
with open(endpoints_output_file, 'w') as f:
json.dump(endpoints_data, f, indent=2)
logging.info(f"Successfully generated {endpoints_output_file} with {len(endpoints_map)} port-keyed endpoint(s).")
logging.info("This file maps each proxy to a list of WebSocket endpoints for Camoufox.")
logging.info("The token_generator uses this map to connect to the correct remote browser.")
else:
logging.info("Skipping Camoufox configuration generation for 'management' role.")
# --- Generate docker-compose-ytdlp-ops.yaml ---
ytdlp_ops_template = env.get_template('docker-compose-ytdlp-ops.yaml.j2')
ytdlp_ops_output_file = os.path.join(script_dir, 'docker-compose-ytdlp-ops.yaml')
# Combine all proxies (camoufox and general) into a single string for the server.
all_proxies = []
if expanded_camoufox_proxies_str:
all_proxies.extend([p.strip() for p in expanded_camoufox_proxies_str.split(',') if p.strip()])
general_proxies_str = os.getenv('GENERAL_PROXIES')
if general_proxies_str:
expanded_general_proxies_str = expand_env_vars(general_proxies_str)
logging.info(f"Expanded GENERAL_PROXIES from '{general_proxies_str}' to '{expanded_general_proxies_str}'")
general_proxies = [p.strip() for p in expanded_general_proxies_str.split(',') if p.strip()]
all_proxies.extend(general_proxies)
logging.info(f"Adding {len(general_proxies)} general purpose proxy/proxies.")
# Also check for the SOCKS5_SOCK_SERVER_IP for backward compatibility with docs
socks_server_ip = os.getenv('SOCKS5_SOCK_SERVER_IP', '172.17.0.1')
if socks_server_ip:
socks_server_port = os.getenv('SOCKS5_SOCK_SERVER_PORT', '1087')
general_proxy_url = f"socks5://{socks_server_ip}:{socks_server_port}"
if general_proxy_url not in all_proxies:
all_proxies.append(general_proxy_url)
logging.info(f"Adding general purpose proxy from SOCKS5_SOCK_SERVER_IP: {general_proxy_url}")
combined_proxies_str = ",".join(all_proxies)
logging.info(f"Combined proxy string for ytdlp-ops-service: '{combined_proxies_str}'")
ytdlp_ops_config_data = {
'combined_proxies_str': combined_proxies_str,
'service_role': service_role,
}
rendered_ytdlp_ops_config = ytdlp_ops_template.render(ytdlp_ops_config_data)
with open(ytdlp_ops_output_file, 'w') as f:
f.write(rendered_ytdlp_ops_config)
logging.info(f"Successfully generated {ytdlp_ops_output_file}")
# --- Envoy Configuration ---
envoy_port = int(os.getenv('ENVOY_PORT', 9080))
base_port = int(os.getenv('YTDLP_BASE_PORT', 9090))
envoy_admin_port = int(os.getenv('ENVOY_ADMIN_PORT', 9901))
# For local dev, ENVOY_BACKEND_ADDRESS is set to 127.0.0.1. For Docker, it's unset, so we default to the service name.
backend_address = os.getenv('ENVOY_BACKEND_ADDRESS', 'ytdlp-ops-service')
# Use STATIC for IP addresses, and STRICT_DNS for anything else (hostnames).
envoy_cluster_type = 'STATIC' if is_ip_address(backend_address) else 'STRICT_DNS'
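# STATIC clusters require literal IP endpoints; STRICT_DNS lets Envoy resolve (and
# re-resolve) the Docker service name as container IPs change.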
# --- Generate envoy.yaml ---
envoy_template = env.get_template('envoy.yaml.j2')
envoy_output_file = os.path.join(script_dir, 'envoy.yaml')
logging.info("--- Generating Envoy Configuration ---")
logging.info(f"Envoy will listen on public port: {envoy_port}")
logging.info(f"It will load balance requests across {worker_count} internal gRPC endpoints of the 'ytdlp-ops-service'.")
logging.info(f"The backend service is located at: '{backend_address}' (type: {envoy_cluster_type})")
envoy_config_data = {
'envoy_port': envoy_port,
'worker_count': worker_count,
'base_port': base_port,
'envoy_admin_port': envoy_admin_port,
'backend_address': backend_address,
'envoy_cluster_type': envoy_cluster_type,
}
rendered_envoy_config = envoy_template.render(envoy_config_data)
with open(envoy_output_file, 'w') as f:
f.write(rendered_envoy_config)
logging.info(f"Successfully generated {envoy_output_file}")
logging.info("--- Configuration Generation Complete ---")
except Exception as e:
logging.error(f"Failed to generate configurations: {e}", exc_info=True)
sys.exit(1)
if __name__ == '__main__':
generate_configs()

102
airflow/init-airflow.sh Executable file
View File

@ -0,0 +1,102 @@
#!/bin/bash
#
# This script should be run on the Airflow host (master or worker)
# to initialize the environment. It creates the .env file and sets
# up permissions.
#
set -e
# --- Configuration ---
# The directory where docker-compose.yaml is located
AIRFLOW_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$AIRFLOW_DIR"
echo "--- Initializing Airflow Environment in $AIRFLOW_DIR ---"
# --- Step 1: Create or update .env file for Docker permissions ---
if [ -f ".env" ]; then
echo ".env file already exists. Ensuring correct permissions are set..."
# Ensure AIRFLOW_UID is set to the current user's ID.
if ! grep -q "^AIRFLOW_UID=" .env; then
echo "AIRFLOW_UID not found in .env. Appending..."
echo "AIRFLOW_UID=$(id -u)" >> .env
fi
# Ensure HOSTNAME is set for worker identity.
if ! grep -q "^HOSTNAME=" .env; then
echo "HOSTNAME not found in .env. Appending..."
echo "HOSTNAME=$(hostname)" >> .env
fi
# Force AIRFLOW_GID to be 0, as required by the Airflow image.
# This removes any existing AIRFLOW_GID line and adds the correct one.
if grep -q "^AIRFLOW_GID=" .env; then
echo "Found existing AIRFLOW_GID. Forcing it to 0..."
# The sed command works on both Linux and macOS, creating a .env.bak file.
sed -i.bak '/^AIRFLOW_GID=/d' .env
fi
echo "AIRFLOW_GID=0" >> .env
echo "Permissions updated in .env file."
else
echo "Creating .env file..."
# Note: On Linux hosts, this is crucial for permissions.
echo "AIRFLOW_UID=$(id -u)" > .env
echo "AIRFLOW_GID=0" >> .env
# Add HOSTNAME for worker-specific queueing and container identity
echo "HOSTNAME=$(hostname)" >> .env
# Add default passwords. These should be changed for production.
echo "POSTGRES_PASSWORD=pgdb_pwd_A7bC2xY9zE1wV5uP" >> .env
echo "REDIS_PASSWORD=redis_pwd_K3fG8hJ1mN5pQ2sT" >> .env
echo "AIRFLOW_ADMIN_PASSWORD=admin_pwd_X9yZ3aB1cE5dF7gH" >> .env
echo ".env file created. For a DL worker, you must also add MASTER_HOST_IP. Please review and update passwords."
fi
echo "Current .env contents:"
cat .env
echo "----------------------------------------"
# --- Step 2: Create directories and set permissions ---
# These directories are mounted into the containers and need to exist on the host.
echo "Ensuring mounted directories exist..."
# Define directories in an array for reuse
DIRS_TO_CREATE=(dags logs plugins config inputfiles downloadfiles addfiles)
mkdir -p "${DIRS_TO_CREATE[@]}"
echo "Directories checked/created."
# Load .env to get AIRFLOW_UID. The `set -o allexport` command exports all variables defined from now on.
if [ -f .env ]; then
set -o allexport
source .env
set +o allexport
else
echo "ERROR: .env file not found. Cannot determine AIRFLOW_UID for setting permissions."
exit 1
fi
# Set permissions on the directories. This is crucial for the Airflow user inside the container.
# The airflow-init container on the master does this, but for workers, we must do it here.
echo "Setting ownership for mounted directories to AIRFLOW_UID=${AIRFLOW_UID}..."
if command -v sudo &> /dev/null; then
sudo chown -R "${AIRFLOW_UID}:0" "${DIRS_TO_CREATE[@]}"
echo "Permissions set successfully."
else
echo "WARNING: 'sudo' command not found. Attempting 'chown' as current user."
chown -R "${AIRFLOW_UID}:0" "${DIRS_TO_CREATE[@]}" || (
echo "ERROR: Failed to set permissions. Please run the following command manually with appropriate privileges:"
echo "chown -R \"${AIRFLOW_UID}:0\" dags logs plugins config inputfiles downloadfiles addfiles"
exit 1
)
echo "Permissions set successfully."
fi
echo "----------------------------------------"
# --- Step 3: Instructions for creating admin user ---
echo "--- Next Steps ---"
echo "1. Ensure your docker-compose.yaml (and -master.yaml, -dl.yaml) files are present."
echo "2. Start Airflow services: docker compose up -d"
echo "3. The admin user will be created automatically with the password from your .env file."
echo " Default username: admin"
echo " Default password can be found in .env as AIRFLOW_ADMIN_PASSWORD"
echo
echo "Initialization complete."

View File

@ -0,0 +1,32 @@
#!/bin/bash
#
# This script should be run on the YT Service host to initialize the environment.
# It creates the .env file from the example if it doesn't exist.
#
set -e
# --- Configuration ---
# The directory where docker-compose-ytdlp-ops.yaml is located
SERVICE_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SERVICE_DIR"
echo "--- Initializing YT Service Environment in $SERVICE_DIR ---"
# --- Step 1: Create .env file from .env.example ---
if [ -f ".env" ]; then
echo ".env file already exists. Skipping creation."
else
if [ -f ".env.example" ]; then
echo "Creating .env file from .env.example..."
cp .env.example .env
echo ".env file created. IMPORTANT: Please edit it with your production values."
else
echo "Warning: .env.example not found. Cannot create .env file."
echo "Please create a .env file manually."
fi
fi
echo "----------------------------------------"
echo "Initialization check complete."
echo "Please review the .env file and then follow the 'Next Steps' from the deployment script."

View File

@ -0,0 +1,130 @@
[
"https://www.youtube.com/watch?v=EH81MQiDyFs",
"https://www.youtube.com/watch?v=YwC2VtRFBPs",
"https://www.youtube.com/watch?v=keSo7x42Xis",
"https://www.youtube.com/watch?v=K6OlxDi1cws",
"https://www.youtube.com/watch?v=eIYjjvR_k6w",
"https://www.youtube.com/watch?v=CprKmvtw-TE",
"https://www.youtube.com/watch?v=4vB1bDJ8dvA",
"https://www.youtube.com/watch?v=kJcvr693bjI",
"https://www.youtube.com/watch?v=NPQz5Hn6XKM",
"https://www.youtube.com/watch?v=DCo-7dCw2OY",
"https://www.youtube.com/watch?v=Q0996ndUMxU",
"https://www.youtube.com/watch?v=IxbFckR3yIc",
"https://www.youtube.com/watch?v=xt5QQgEqVzs",
"https://www.youtube.com/watch?v=L9pzC26i3BU",
"https://www.youtube.com/watch?v=YlkzSAqV0jE",
"https://www.youtube.com/watch?v=v9ZxQw3NQA8",
"https://www.youtube.com/watch?v=EB_eBvRsGqM",
"https://www.youtube.com/watch?v=xJ4PHYU3oY4",
"https://www.youtube.com/watch?v=kHf-eCb7q2I",
"https://www.youtube.com/watch?v=q3hNcqo5qdY",
"https://www.youtube.com/watch?v=097ujVv38LU",
"https://www.youtube.com/watch?v=VYnzo8xa_dw",
"https://www.youtube.com/watch?v=2y690c69yb4",
"https://www.youtube.com/watch?v=R_JiPanFbEs",
"https://www.youtube.com/watch?v=_VF9sk-IjOE",
"https://www.youtube.com/watch?v=01yS1dPQsZc",
"https://www.youtube.com/watch?v=0xW7slvHwiU",
"https://www.youtube.com/watch?v=qeeC7i5HTpU",
"https://www.youtube.com/watch?v=McvQBwZ_MfY",
"https://www.youtube.com/watch?v=ssQ456jGiKs",
"https://www.youtube.com/watch?v=Xz84juOdgVY",
"https://www.youtube.com/watch?v=6jw_rFi75YA",
"https://www.youtube.com/watch?v=XVtwjyQESLI",
"https://www.youtube.com/watch?v=GCuRuMZG2CU",
"https://www.youtube.com/watch?v=SLGT3nSHjKY",
"https://www.youtube.com/watch?v=KfXZckcDnwc",
"https://www.youtube.com/watch?v=krlijOR_314",
"https://www.youtube.com/watch?v=c5TIIXZTWYU",
"https://www.youtube.com/watch?v=xbFlak2wDPU",
"https://www.youtube.com/watch?v=ESiCVT43y4M",
"https://www.youtube.com/watch?v=9K-8HK9NGPo",
"https://www.youtube.com/watch?v=AXfq7U9EHHY",
"https://www.youtube.com/watch?v=oWGeLLFTwhk",
"https://www.youtube.com/watch?v=dGTid_QDq3M",
"https://www.youtube.com/watch?v=s2GdkHY7e74",
"https://www.youtube.com/watch?v=EYRnywNSHfM",
"https://www.youtube.com/watch?v=8QcanJptlFs",
"https://www.youtube.com/watch?v=8_B0MrjTDqw",
"https://www.youtube.com/watch?v=2LealZ7TTlY",
"https://www.youtube.com/watch?v=dtBosQzUqDs",
"https://www.youtube.com/watch?v=PuQwOWigWVA",
"https://www.youtube.com/watch?v=LOlVXM27ap8",
"https://www.youtube.com/watch?v=JtgKbx6nm7I",
"https://www.youtube.com/watch?v=owFxod3Pe70",
"https://www.youtube.com/watch?v=dmBpn2ZjNW4",
"https://www.youtube.com/watch?v=7Do8GAKRFsw",
"https://www.youtube.com/watch?v=7oysSz1unf0",
"https://www.youtube.com/watch?v=Z4Wn7qrR0nU",
"https://www.youtube.com/watch?v=wvgwnY0x6wo",
"https://www.youtube.com/watch?v=qUGZg985hqA",
"https://www.youtube.com/watch?v=pWvyocl7dhI",
"https://www.youtube.com/watch?v=BMzSz3aiBFU",
"https://www.youtube.com/watch?v=mgOGXUctR8U",
"https://www.youtube.com/watch?v=1rIhg0Z-Ylo",
"https://www.youtube.com/watch?v=K4hj2aQ8vCM",
"https://www.youtube.com/watch?v=jzMt0J7eohg",
"https://www.youtube.com/watch?v=LeYfSHB1zZw",
"https://www.youtube.com/watch?v=hBS3QbVFHQk",
"https://www.youtube.com/watch?v=2mBdZZm8Syo",
"https://www.youtube.com/watch?v=zaZE_AHeRIc",
"https://www.youtube.com/watch?v=DBod4x5OZsM",
"https://www.youtube.com/watch?v=lNYnMLhMMNc",
"https://www.youtube.com/watch?v=Feo_5sWRjY0",
"https://www.youtube.com/watch?v=tYWLm75nibA",
"https://www.youtube.com/watch?v=xx1HYybZDH0",
"https://www.youtube.com/watch?v=EyIY0BKYIrA",
"https://www.youtube.com/watch?v=BfAoe4GbKt4",
"https://www.youtube.com/watch?v=qmizxZdHB7A",
"https://www.youtube.com/watch?v=7K73KytWJR4",
"https://www.youtube.com/watch?v=hPyi-EnO_Dw",
"https://www.youtube.com/watch?v=M4Gp7eMj2IQ",
"https://www.youtube.com/watch?v=rPOOnshXEOk",
"https://www.youtube.com/watch?v=fmOB4FNj4MM",
"https://www.youtube.com/watch?v=UgwjPBJ-iyA",
"https://www.youtube.com/watch?v=tInqj66fkxc",
"https://www.youtube.com/watch?v=tok-jMC1V0E",
"https://www.youtube.com/watch?v=2IuaROF1pMs",
"https://www.youtube.com/watch?v=Ak5JpqBA5No",
"https://www.youtube.com/watch?v=A_yH2vzq7CY",
"https://www.youtube.com/watch?v=4nzsI5fxdlA",
"https://www.youtube.com/watch?v=1FfwsJInFOM",
"https://www.youtube.com/watch?v=uRjJbkgf_3I",
"https://www.youtube.com/watch?v=HMjduefTG4E",
"https://www.youtube.com/watch?v=Cw9hUSFppnw",
"https://www.youtube.com/watch?v=vrobF1L3BJ8",
"https://www.youtube.com/watch?v=tIiVUsKPCEY",
"https://www.youtube.com/watch?v=7qprIRCTX6A",
"https://www.youtube.com/watch?v=HREKaNF7TT8",
"https://www.youtube.com/watch?v=xlIgqZ1sW5A",
"https://www.youtube.com/watch?v=6_uA0osze4w",
"https://www.youtube.com/watch?v=jarbK6tvflw",
"https://www.youtube.com/watch?v=RWmeSE312FA",
"https://www.youtube.com/watch?v=hhI7lAonIrU",
"https://www.youtube.com/watch?v=4k23-uYPObU",
"https://www.youtube.com/watch?v=rIxiOD0dA3w",
"https://www.youtube.com/watch?v=Ry-_mpn3Pe8",
"https://www.youtube.com/watch?v=m-H4fOb1o2Q",
"https://www.youtube.com/watch?v=NhGxI_tgSwI",
"https://www.youtube.com/watch?v=VTslivtVfAI",
"https://www.youtube.com/watch?v=huSCDYe04Fk",
"https://www.youtube.com/watch?v=LF82qA5a05E",
"https://www.youtube.com/watch?v=kHaHsbFg28M",
"https://www.youtube.com/watch?v=NKDFri_kL94",
"https://www.youtube.com/watch?v=BPIlpDQwWqA",
"https://www.youtube.com/watch?v=UTCAshkc8qk",
"https://www.youtube.com/watch?v=EkUtGGKaX_I",
"https://www.youtube.com/watch?v=tuLyfqdpYxU",
"https://www.youtube.com/watch?v=snxBL-8IGCA",
"https://www.youtube.com/watch?v=Mo9m8EdR8_Y",
"https://www.youtube.com/watch?v=5nBipdnGAbU",
"https://www.youtube.com/watch?v=sLs6vp5TH_w",
"https://www.youtube.com/watch?v=OYM5PrQtT34",
"https://www.youtube.com/watch?v=FX3wjgGWn1s",
"https://www.youtube.com/watch?v=1FfwsJInFOM",
"https://www.youtube.com/watch?v=osWMBc6h5Rs",
"https://www.youtube.com/watch?v=aojc0sLBm5Y",
"https://www.youtube.com/watch?v=akf_6pAx024",
"https://www.youtube.com/watch?v=SgSkvKpAxMQ"
]

View File

@ -0,0 +1,101 @@
[
"https://www.youtube.com/watch?v=Y0WQdA4srb0",
"https://www.youtube.com/watch?v=uFyraEVj848",
"https://www.youtube.com/watch?v=VxPx0Qjgbos",
"https://www.youtube.com/watch?v=FuKOn-_rfeE",
"https://www.youtube.com/watch?v=mn9t5eOs30c",
"https://www.youtube.com/watch?v=7YOE0GEUrVo",
"https://www.youtube.com/watch?v=4L8kv6qVTfY",
"https://www.youtube.com/watch?v=7WSEWOft4Y4",
"https://www.youtube.com/watch?v=bmDsn0_1-f0",
"https://www.youtube.com/watch?v=IILtHOqYndA",
"https://www.youtube.com/watch?v=tyGqbWBjSWE",
"https://www.youtube.com/watch?v=3tgZTpkZQkQ",
"https://www.youtube.com/watch?v=JJH-CkjiQWI",
"https://www.youtube.com/watch?v=4hLWn4hHKNM",
"https://www.youtube.com/watch?v=IFwr6QGxoJo",
"https://www.youtube.com/watch?v=Fj-NKUoMbmI",
"https://www.youtube.com/watch?v=zvoxV3wLjFE",
"https://www.youtube.com/watch?v=EcC4CIyUI2Q",
"https://www.youtube.com/watch?v=jtjiTuTKCT4",
"https://www.youtube.com/watch?v=am28qDtXLLU",
"https://www.youtube.com/watch?v=WNVW86YBkMg",
"https://www.youtube.com/watch?v=kG51upknRCw",
"https://www.youtube.com/watch?v=E-HpdWghf2U",
"https://www.youtube.com/watch?v=GuaAOc9ZssE",
"https://www.youtube.com/watch?v=r1JkW0zfPOA",
"https://www.youtube.com/watch?v=OBYmpN8uAag",
"https://www.youtube.com/watch?v=0HuGAMKHXD4",
"https://www.youtube.com/watch?v=eDmdalDaPdU",
"https://www.youtube.com/watch?v=ZjDR1XMd904",
"https://www.youtube.com/watch?v=HGrsrP4idE8",
"https://www.youtube.com/watch?v=l-J_J7YFDYY",
"https://www.youtube.com/watch?v=Kr5rl0935K4",
"https://www.youtube.com/watch?v=KgK4bu9O384",
"https://www.youtube.com/watch?v=BDq3_y4mXYo",
"https://www.youtube.com/watch?v=slRiaDz12m8",
"https://www.youtube.com/watch?v=iX1oWEsHh0A",
"https://www.youtube.com/watch?v=0zJcsxB6-UU",
"https://www.youtube.com/watch?v=NTOokrCHzJA",
"https://www.youtube.com/watch?v=CXYXqQ-VuYo",
"https://www.youtube.com/watch?v=xaxZtPTEraU",
"https://www.youtube.com/watch?v=wX1wNCPZdE8",
"https://www.youtube.com/watch?v=DOt7ckIGN4Y",
"https://www.youtube.com/watch?v=bncasw-Z4Ow",
"https://www.youtube.com/watch?v=nbVWfXlo7kQ",
"https://www.youtube.com/watch?v=Uu6DmhonkEE",
"https://www.youtube.com/watch?v=HGWigeoSMvA",
"https://www.youtube.com/watch?v=rjbLCaC9yFE",
"https://www.youtube.com/watch?v=Uew7f09gW4o",
"https://www.youtube.com/watch?v=uzc-jLt65mY",
"https://www.youtube.com/watch?v=ZX7qnLuAsMU",
"https://www.youtube.com/watch?v=ZlSgDvCP5UI",
"https://www.youtube.com/watch?v=RmGIid7Yctw",
"https://www.youtube.com/watch?v=u9g0_eR5gEk",
"https://www.youtube.com/watch?v=wu9Cw905NUU",
"https://www.youtube.com/watch?v=cNhQVoY5V5Q",
"https://www.youtube.com/watch?v=I63iJNKOb8I",
"https://www.youtube.com/watch?v=3G5ceoSK6jg",
"https://www.youtube.com/watch?v=JF4TbV940PM",
"https://www.youtube.com/watch?v=0yGaVHfmGa0",
"https://www.youtube.com/watch?v=r8cgtI_ZQIY",
"https://www.youtube.com/watch?v=OcG3-r98XEM",
"https://www.youtube.com/watch?v=w7hooOUEMQI",
"https://www.youtube.com/watch?v=yipW8SF5Gxk",
"https://www.youtube.com/watch?v=LH4PqRiuxts",
"https://www.youtube.com/watch?v=IfAsA3ezUqQ",
"https://www.youtube.com/watch?v=5cUg8I0yps4",
"https://www.youtube.com/watch?v=lCea6bQj3eg",
"https://www.youtube.com/watch?v=5Ie0MAv4XCY",
"https://www.youtube.com/watch?v=57eomGPy1PU",
"https://www.youtube.com/watch?v=TEnk3OfU8Gc",
"https://www.youtube.com/watch?v=1uA4xXlDhvE",
"https://www.youtube.com/watch?v=aXF8ijpn4bM",
"https://www.youtube.com/watch?v=3vKmCDomyJ8",
"https://www.youtube.com/watch?v=z7jLEWJ59uY",
"https://www.youtube.com/watch?v=0TTsKnyH6EY",
"https://www.youtube.com/watch?v=PcqA6Y1RfVQ",
"https://www.youtube.com/watch?v=f1Ar3ydryqc",
"https://www.youtube.com/watch?v=N2nLayOIjxM",
"https://www.youtube.com/watch?v=Cziyx9qaYVM",
"https://www.youtube.com/watch?v=RTJCbIJ294w",
"https://www.youtube.com/watch?v=GC1FB-bZTvA",
"https://www.youtube.com/watch?v=kKYv5uLBSFk",
"https://www.youtube.com/watch?v=jfQHlnNeKzw",
"https://www.youtube.com/watch?v=J7e8PRu9kSU",
"https://www.youtube.com/watch?v=UoHf6pdy0oE",
"https://www.youtube.com/watch?v=JOwNcwSupXs",
"https://www.youtube.com/watch?v=gxwk-bb78-U",
"https://www.youtube.com/watch?v=_lrDwiK544A",
"https://www.youtube.com/watch?v=6i8BVQ9GE1g",
"https://www.youtube.com/watch?v=8c_l9D1qyKY",
"https://www.youtube.com/watch?v=KFCr5BdjFB8",
"https://www.youtube.com/watch?v=orEvHn7lL4A",
"https://www.youtube.com/watch?v=6BhGJxrp8P4",
"https://www.youtube.com/watch?v=n2t8beFnhyA",
"https://www.youtube.com/watch?v=GJzZ2-f_k30",
"https://www.youtube.com/watch?v=oId850O591s",
"https://www.youtube.com/watch?v=f2XmdQdwppw",
"https://www.youtube.com/watch?v=iWM_oe-JY_k",
"https://www.youtube.com/watch?v=GHEDWE9LjRY"
]

View File

@ -0,0 +1,30 @@
[
"https://www.youtube.com/watch?v=lKrVuufVMXA",
"https://www.youtube.com/watch?v=ISqDcqGdow0",
"https://www.youtube.com/watch?v=srG-WnQdZq8",
"https://www.youtube.com/watch?v=HP-KB6XFqgs",
"https://www.youtube.com/watch?v=1e13SIh51wk",
"https://www.youtube.com/watch?v=VTKG48FjSxs",
"https://www.youtube.com/watch?v=onEWAyPRm6E",
"https://www.youtube.com/watch?v=7RdrGwpZzMo",
"https://www.youtube.com/watch?v=M5uu93_AhXg",
"https://www.youtube.com/watch?v=xnkvCBfTfok",
"https://www.youtube.com/watch?v=oE9hGZyFN8E",
"https://www.youtube.com/watch?v=7LofBMRP6U4",
"https://www.youtube.com/watch?v=EDE8tyroJEE",
"https://www.youtube.com/watch?v=oLwsWGi0sUc",
"https://www.youtube.com/watch?v=a6dvhHPyFIw",
"https://www.youtube.com/watch?v=4jds773UlWE",
"https://www.youtube.com/watch?v=B6dXxqiSBSM",
"https://www.youtube.com/watch?v=9EbS6w3RSG0",
"https://www.youtube.com/watch?v=LyKONGzUANU",
"https://www.youtube.com/watch?v=sGW5kfpR6Wo",
"https://www.youtube.com/watch?v=pa4-JninkUQ",
"https://www.youtube.com/watch?v=DxXMFBWarjY",
"https://www.youtube.com/watch?v=PYQjfpCEWvc",
"https://www.youtube.com/watch?v=_jlNCjI9jiQ",
"https://www.youtube.com/watch?v=BxEC11QS3sQ",
"https://www.youtube.com/watch?v=6-qbWRzVbGA",
"https://www.youtube.com/watch?v=p3lCQvZBv_k",
"https://www.youtube.com/watch?v=67YA1CHpGrM"
]

View File

@ -0,0 +1,5 @@
[
"https://www.youtube.com/watch?v=uxiLE2Kv7wc",
"https://www.youtube.com/watch?v=Q7R0epGFnRI",
"https://www.youtube.com/watch?v=4mEmsJXKroE"
]

View File

@ -0,0 +1,48 @@
[
"https://www.youtube.com/watch?v=l700b4BpFAA",
"https://www.youtube.com/watch?v=G_JAVwwWyUM",
"https://www.youtube.com/watch?v=2LGz9nUw-XI",
"https://www.youtube.com/watch?v=7dK6a8LWAWw",
"https://www.youtube.com/watch?v=lKSZnZggcto",
"https://www.youtube.com/watch?v=Zy0ZFAMqm7U",
"https://www.youtube.com/watch?v=7UunWMHBrEE",
"https://www.youtube.com/watch?v=LPdbLCX3N-4",
"https://www.youtube.com/watch?v=-lJ5DVbkVw4",
"https://www.youtube.com/watch?v=QrRRS0RzELs",
"https://www.youtube.com/watch?v=XSty74mE1iE",
"https://www.youtube.com/watch?v=orijdeDOk5g",
"https://www.youtube.com/watch?v=27YVRo9VUE8",
"https://www.youtube.com/watch?v=p-JNgLI_8nA",
"https://www.youtube.com/watch?v=gkekjIJB_Nw",
"https://www.youtube.com/watch?v=V8QFCgOfkgw",
"https://www.youtube.com/watch?v=_GVVEsxZ_Mo",
"https://www.youtube.com/watch?v=7_zMqxK4gZE",
"https://www.youtube.com/watch?v=cwuJCb316yQ",
"https://www.youtube.com/watch?v=TIGxtvVVHak",
"https://www.youtube.com/watch?v=KhcicW2keWY",
"https://www.youtube.com/watch?v=miUJ85pFCPE",
"https://www.youtube.com/watch?v=97L4qVfSwv4",
"https://www.youtube.com/watch?v=Wk38hWQfz24",
"https://www.youtube.com/watch?v=iIU-NVWkTDE",
"https://www.youtube.com/watch?v=l89VaRof8ug",
"https://www.youtube.com/watch?v=IIkjS5MpQVM",
"https://www.youtube.com/watch?v=9XxPGKkOs0o",
"https://www.youtube.com/watch?v=_dlpve9GPZM",
"https://www.youtube.com/watch?v=He_3MjAuZNQ",
"https://www.youtube.com/watch?v=FnPEHn2NHT4",
"https://www.youtube.com/watch?v=HuSjI7HFkzo",
"https://www.youtube.com/watch?v=pBZSgVJHacs",
"https://www.youtube.com/watch?v=OgsG082zDGo",
"https://www.youtube.com/watch?v=_4sxhmPsryY",
"https://www.youtube.com/watch?v=kqU6B5rIEnI",
"https://www.youtube.com/watch?v=BEYn_ILHmBE",
"https://www.youtube.com/watch?v=qy9Zr3HV9V4",
"https://www.youtube.com/watch?v=7I1VvJZbG-M",
"https://www.youtube.com/watch?v=WOa-HA3MoVQ",
"https://www.youtube.com/watch?v=uaHI-WHwivc",
"https://www.youtube.com/watch?v=9ku8r8uZ9EQ",
"https://www.youtube.com/watch?v=XAyaDcLxwHQ",
"https://www.youtube.com/watch?v=zpc-hJGSNBc",
"https://www.youtube.com/watch?v=AGbG62y1DyE",
"https://www.youtube.com/watch?v=7rmyabL60oA"
]

27
airflow/nginx.conf Normal file
View File

@ -0,0 +1,27 @@
events {
worker_connections 1024;
}
stream {
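# Layer-4 (TCP) pass-through: connections on 9000/9001 are balanced across the
# three MinIO nodes using nginx's default round-robin policy.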
upstream minio_servers {
server minio1:9000;
server minio2:9000;
server minio3:9000;
}
upstream minio_console_servers {
server minio1:9001;
server minio2:9001;
server minio3:9001;
}
server {
listen 9000;
proxy_pass minio_servers;
}
server {
listen 9001;
proxy_pass minio_console_servers;
}
}

View File

@ -0,0 +1,56 @@
from airflow.plugins_manager import AirflowPlugin
from airflow.hooks.base import BaseHook
from airflow.configuration import conf
import uuid
import backoff
class YTDLPHook(BaseHook):
def __init__(self, conn_id='ytdlp_default'):
super().__init__()
self.conn_id = conn_id
self.connection = self.get_connection(conn_id)
self.timeout = conf.getint('ytdlp', 'timeout', fallback=120)
self.max_retries = conf.getint('ytdlp', 'max_retries', fallback=3)
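# Retry start_service with exponential backoff: at most 3 attempts and no more
# than 300 seconds in total.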
@backoff.on_exception(backoff.expo,
Exception,
max_tries=3,
max_time=300)
def start_service(self, host, port, service_id, work_dir):
"""Start token service as a long-running process"""
import subprocess
import os
from pathlib import Path
# Get script path relative to Airflow home
airflow_home = os.getenv('AIRFLOW_HOME', '')
script_path = Path(airflow_home).parent / 'ytdlp_ops_server.py'
# Ensure work directory exists
os.makedirs(work_dir, exist_ok=True)
# Start service process
cmd = [
'python', str(script_path),
'--port', str(port),
'--host', host,
'--service-id', service_id,
'--context-dir', work_dir,
'--script-dir', str(Path(airflow_home) / 'dags' / 'scripts')
]
self.log.info(f"Starting token service: {' '.join(cmd)}")
# Start process detached
docker_cmd = [
'docker-compose', '-f', 'docker-compose.yaml',
'up', '-d', '--build', 'ytdlp-service'
]
subprocess.run(docker_cmd, check=True)
self.log.info(f"Token service started on {host}:{port}")
return True
class YTDLPPlugin(AirflowPlugin):
name = 'ytdlp_plugin'
hooks = [YTDLPHook]

View File

@ -0,0 +1,124 @@
---
- name: Check if Airflow master deployment directory exists
stat:
path: "{{ airflow_master_dir }}"
register: master_dir_stat
- name: Ensure Airflow master deployment directory exists
file:
path: "{{ airflow_master_dir }}"
state: directory
owner: "{{ ssh_user }}"
group: ytdl
mode: '0755'
become: yes
when: not master_dir_stat.stat.exists
- name: Check if source directories exist
stat:
path: "{{ playbook_dir }}/../{{ item }}"
register: source_dirs
loop:
- "airflow/inputfiles"
- "airflow/plugins"
- "airflow/addfiles"
- "airflow/bgutil-ytdlp-pot-provider"
- name: Sync Airflow master files
synchronize:
src: "{{ playbook_dir }}/../{{ item }}"
dest: "{{ airflow_master_dir }}/"
archive: yes
recursive: yes
delete: yes
rsync_opts: "{{ rsync_default_opts }}"
loop:
- "airflow/Dockerfile"
- "airflow/docker-compose-master.yaml"
- "airflow/dags/"
- "airflow/config/"
- "setup.py"
- "yt_ops_services/"
- "thrift_model/"
- "VERSION"
- "airflow/init-airflow.sh"
- "airflow/nginx.conf"
- "get_info_json_client.py"
- "proxy_manager_client.py"
- name: Sync optional directories if they exist
synchronize:
src: "{{ playbook_dir }}/../{{ item }}/"
dest: "{{ airflow_master_dir }}/{{ item | basename }}/"
archive: yes
recursive: yes
delete: yes
rsync_opts: "{{ rsync_default_opts }}"
loop:
- "airflow/inputfiles"
- "airflow/plugins"
- "airflow/addfiles"
- "airflow/bgutil-ytdlp-pot-provider"
when: source_dirs.results | selectattr('item', 'equalto', item) | map(attribute='stat.exists') | first
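# The 'when' above looks up this loop item's earlier stat result ("Check if source
# directories exist") and skips the sync if the directory is missing on the control machine.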
- name: Sync pangramia thrift files
synchronize:
src: "{{ playbook_dir }}/../thrift_model/gen_py/pangramia/"
dest: "{{ airflow_master_dir }}/pangramia/"
archive: yes
recursive: yes
delete: yes
rsync_opts: "{{ rsync_default_opts }}"
- name: Create .env file for Airflow master service
template:
src: "../../templates/.env.master.j2"
dest: "{{ airflow_master_dir }}/.env"
mode: "{{ file_permissions }}"
owner: "{{ ssh_user }}"
group: ytdl
vars:
service_role: "master"
- name: Create symlink for docker-compose.yaml
file:
src: "{{ airflow_master_dir }}/docker-compose-master.yaml"
dest: "{{ airflow_master_dir }}/docker-compose.yaml"
state: link
owner: "{{ ssh_user }}"
group: ytdl
force: yes
- name: Verify Dockerfile exists in build directory
stat:
path: "{{ airflow_master_dir }}/Dockerfile"
register: dockerfile_stat
- name: Fail if Dockerfile is missing
fail:
msg: "Dockerfile not found in {{ airflow_master_dir }}. Cannot build image."
when: not dockerfile_stat.stat.exists
- name: Build Airflow master image
community.docker.docker_image:
name: "{{ airflow_image_name }}"
build:
path: "{{ airflow_master_dir }}"
dockerfile: "Dockerfile"
source: build
force_source: true
- name: Run Airflow init script
shell:
cmd: "chmod +x init-airflow.sh && ./init-airflow.sh"
chdir: "{{ airflow_master_dir }}"
become: yes
become_user: "{{ ssh_user }}"
- name: Start Airflow master service
community.docker.docker_compose_v2:
project_src: "{{ airflow_master_dir }}"
files:
- "docker-compose-master.yaml"
state: present
remove_orphans: true

View File

@ -0,0 +1,103 @@
---
- name: Check if Airflow worker deployment directory exists
stat:
path: "{{ airflow_worker_dir }}"
register: worker_dir_stat
- name: Ensure Airflow worker deployment directory exists
file:
path: "{{ airflow_worker_dir }}"
state: directory
owner: "{{ ssh_user }}"
group: ytdl
mode: '0755'
become: yes
when: not worker_dir_stat.stat.exists
- name: Sync Airflow worker files
synchronize:
src: "{{ playbook_dir }}/../{{ item }}"
dest: "{{ airflow_worker_dir }}/"
archive: yes
recursive: yes
delete: yes
rsync_opts: "{{ rsync_default_opts }}"
loop:
- "airflow/Dockerfile"
- "airflow/docker-compose-dl.yaml"
- "airflow/dags/"
- "airflow/config/"
- "setup.py"
- "yt_ops_services/"
- "thrift_model/"
- "VERSION"
- "airflow/init-airflow.sh"
- "get_info_json_client.py"
- "proxy_manager_client.py"
- "token_generator/"
- "utils/"
- name: Check if inputfiles directory exists
stat:
path: "{{ playbook_dir }}/../airflow/inputfiles"
register: inputfiles_stat
- name: Sync inputfiles directory if it exists
synchronize:
src: "{{ playbook_dir }}/../airflow/inputfiles/"
dest: "{{ airflow_worker_dir }}/inputfiles/"
archive: yes
recursive: yes
delete: yes
rsync_opts: "{{ rsync_default_opts }}"
when: inputfiles_stat.stat.exists
- name: Sync pangramia thrift files
synchronize:
src: "{{ playbook_dir }}/../thrift_model/gen_py/pangramia/"
dest: "{{ airflow_worker_dir }}/pangramia/"
archive: yes
recursive: yes
delete: yes
rsync_opts: "{{ rsync_default_opts }}"
- name: Create .env file for Airflow worker service
template:
src: "../../templates/.env.worker.j2"
dest: "{{ airflow_worker_dir }}/.env"
mode: "{{ file_permissions }}"
owner: "{{ ssh_user }}"
group: ytdl
vars:
service_role: "worker"
- name: Create symlink for docker-compose.yaml
file:
src: "{{ airflow_worker_dir }}/docker-compose-dl.yaml"
dest: "{{ airflow_worker_dir }}/docker-compose.yaml"
state: link
owner: "{{ ssh_user }}"
group: ytdl
force: yes
- name: Build Airflow worker image
community.docker.docker_image:
name: "{{ airflow_image_name }}"
build:
path: "{{ airflow_worker_dir }}"
source: build
force_source: true
- name: Run Airflow init script
shell:
cmd: "chmod +x init-airflow.sh && ./init-airflow.sh"
chdir: "{{ airflow_worker_dir }}"
become: yes
become_user: "{{ ssh_user }}"
- name: Start Airflow worker service
community.docker.docker_compose_v2:
project_src: "{{ airflow_worker_dir }}"
files:
- "docker-compose-dl.yaml"
state: present
remove_orphans: true

View File

@ -0,0 +1,17 @@
#!/bin/sh
set -e
# Wait for MinIO to be ready
until (mc alias set local http://minio:9000 admin 0153093693-0009) do
echo 'Waiting for MinIO...'
sleep 1
done
# Create bucket if it doesn't exist
if ! mc ls local/airflow-logs >/dev/null 2>&1; then
mc mb local/airflow-logs
mc anonymous set download local/airflow-logs
echo 'MinIO bucket initialized'
else
echo 'MinIO bucket already exists'
fi

7
airflow/update-yt-dlp.sh Normal file
View File

@ -0,0 +1,7 @@
#!/bin/bash
# update-yt-dlp.sh run by the worker container before every DAG execution
set -e
echo "[$(date)] Updating yt-dlp to latest nightly master..."
python3 -m pip install -U --pre "yt-dlp[default]" --upgrade-strategy eager --force-reinstall --no-cache-dir
echo "[$(date)] yt-dlp updated to:"
yt-dlp --version

View File

@ -0,0 +1,564 @@
#
# Autogenerated by Thrift Compiler (0.20.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#
from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
from thrift.protocol.TProtocol import TProtocolException
from thrift.TRecursive import fix_spec
import sys
import logging
from .ttypes import *
from thrift.Thrift import TProcessor
from thrift.transport import TTransport
all_structs = []
class Iface(object):
def ping(self):
pass
def reportError(self, message, details):
"""
Parameters:
- message
- details
"""
pass
def shutdown(self):
pass
class Client(Iface):
def __init__(self, iprot, oprot=None):
self._iprot = self._oprot = iprot
if oprot is not None:
self._oprot = oprot
self._seqid = 0
def ping(self):
self.send_ping()
return self.recv_ping()
def send_ping(self):
self._oprot.writeMessageBegin('ping', TMessageType.CALL, self._seqid)
args = ping_args()
args.write(self._oprot)
self._oprot.writeMessageEnd()
self._oprot.trans.flush()
def recv_ping(self):
iprot = self._iprot
(fname, mtype, rseqid) = iprot.readMessageBegin()
if mtype == TMessageType.EXCEPTION:
x = TApplicationException()
x.read(iprot)
iprot.readMessageEnd()
raise x
result = ping_result()
result.read(iprot)
iprot.readMessageEnd()
if result.success is not None:
return result.success
if result.serviceExp is not None:
raise result.serviceExp
if result.userExp is not None:
raise result.userExp
raise TApplicationException(TApplicationException.MISSING_RESULT, "ping failed: unknown result")
def reportError(self, message, details):
"""
Parameters:
- message
- details
"""
self.send_reportError(message, details)
return self.recv_reportError()
def send_reportError(self, message, details):
self._oprot.writeMessageBegin('reportError', TMessageType.CALL, self._seqid)
args = reportError_args()
args.message = message
args.details = details
args.write(self._oprot)
self._oprot.writeMessageEnd()
self._oprot.trans.flush()
def recv_reportError(self):
iprot = self._iprot
(fname, mtype, rseqid) = iprot.readMessageBegin()
if mtype == TMessageType.EXCEPTION:
x = TApplicationException()
x.read(iprot)
iprot.readMessageEnd()
raise x
result = reportError_result()
result.read(iprot)
iprot.readMessageEnd()
if result.success is not None:
return result.success
if result.serviceExp is not None:
raise result.serviceExp
if result.userExp is not None:
raise result.userExp
raise TApplicationException(TApplicationException.MISSING_RESULT, "reportError failed: unknown result")
def shutdown(self):
self.send_shutdown()
def send_shutdown(self):
self._oprot.writeMessageBegin('shutdown', TMessageType.ONEWAY, self._seqid)
args = shutdown_args()
args.write(self._oprot)
self._oprot.writeMessageEnd()
self._oprot.trans.flush()
class Processor(Iface, TProcessor):
def __init__(self, handler):
self._handler = handler
self._processMap = {}
self._processMap["ping"] = Processor.process_ping
self._processMap["reportError"] = Processor.process_reportError
self._processMap["shutdown"] = Processor.process_shutdown
self._on_message_begin = None
def on_message_begin(self, func):
self._on_message_begin = func
def process(self, iprot, oprot):
(name, type, seqid) = iprot.readMessageBegin()
if self._on_message_begin:
self._on_message_begin(name, type, seqid)
if name not in self._processMap:
iprot.skip(TType.STRUCT)
iprot.readMessageEnd()
x = TApplicationException(TApplicationException.UNKNOWN_METHOD, 'Unknown function %s' % (name))
oprot.writeMessageBegin(name, TMessageType.EXCEPTION, seqid)
x.write(oprot)
oprot.writeMessageEnd()
oprot.trans.flush()
return
else:
self._processMap[name](self, seqid, iprot, oprot)
return True
def process_ping(self, seqid, iprot, oprot):
args = ping_args()
args.read(iprot)
iprot.readMessageEnd()
result = ping_result()
try:
result.success = self._handler.ping()
msg_type = TMessageType.REPLY
except TTransport.TTransportException:
raise
except pangramia.yt.exceptions.ttypes.PBServiceException as serviceExp:
msg_type = TMessageType.REPLY
result.serviceExp = serviceExp
except pangramia.yt.exceptions.ttypes.PBUserException as userExp:
msg_type = TMessageType.REPLY
result.userExp = userExp
except TApplicationException as ex:
logging.exception('TApplication exception in handler')
msg_type = TMessageType.EXCEPTION
result = ex
except Exception:
logging.exception('Unexpected exception in handler')
msg_type = TMessageType.EXCEPTION
result = TApplicationException(TApplicationException.INTERNAL_ERROR, 'Internal error')
oprot.writeMessageBegin("ping", msg_type, seqid)
result.write(oprot)
oprot.writeMessageEnd()
oprot.trans.flush()
def process_reportError(self, seqid, iprot, oprot):
args = reportError_args()
args.read(iprot)
iprot.readMessageEnd()
result = reportError_result()
try:
result.success = self._handler.reportError(args.message, args.details)
msg_type = TMessageType.REPLY
except TTransport.TTransportException:
raise
except pangramia.yt.exceptions.ttypes.PBServiceException as serviceExp:
msg_type = TMessageType.REPLY
result.serviceExp = serviceExp
except pangramia.yt.exceptions.ttypes.PBUserException as userExp:
msg_type = TMessageType.REPLY
result.userExp = userExp
except TApplicationException as ex:
logging.exception('TApplication exception in handler')
msg_type = TMessageType.EXCEPTION
result = ex
except Exception:
logging.exception('Unexpected exception in handler')
msg_type = TMessageType.EXCEPTION
result = TApplicationException(TApplicationException.INTERNAL_ERROR, 'Internal error')
oprot.writeMessageBegin("reportError", msg_type, seqid)
result.write(oprot)
oprot.writeMessageEnd()
oprot.trans.flush()
def process_shutdown(self, seqid, iprot, oprot):
args = shutdown_args()
args.read(iprot)
iprot.readMessageEnd()
try:
self._handler.shutdown()
except TTransport.TTransportException:
raise
except Exception:
logging.exception('Exception in oneway handler')
# HELPER FUNCTIONS AND STRUCTURES
class ping_args(object):
def read(self, iprot):
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec])
return
iprot.readStructBegin()
while True:
(fname, ftype, fid) = iprot.readFieldBegin()
if ftype == TType.STOP:
break
else:
iprot.skip(ftype)
iprot.readFieldEnd()
iprot.readStructEnd()
def write(self, oprot):
if oprot._fast_encode is not None and self.thrift_spec is not None:
oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
return
oprot.writeStructBegin('ping_args')
oprot.writeFieldStop()
oprot.writeStructEnd()
def validate(self):
return
def __repr__(self):
L = ['%s=%r' % (key, value)
for key, value in self.__dict__.items()]
return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
def __eq__(self, other):
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
def __ne__(self, other):
return not (self == other)
all_structs.append(ping_args)
ping_args.thrift_spec = (
)
class ping_result(object):
"""
Attributes:
- success
- serviceExp
- userExp
"""
def __init__(self, success=None, serviceExp=None, userExp=None,):
self.success = success
self.serviceExp = serviceExp
self.userExp = userExp
def read(self, iprot):
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec])
return
iprot.readStructBegin()
while True:
(fname, ftype, fid) = iprot.readFieldBegin()
if ftype == TType.STOP:
break
if fid == 0:
if ftype == TType.BOOL:
self.success = iprot.readBool()
else:
iprot.skip(ftype)
elif fid == 1:
if ftype == TType.STRUCT:
self.serviceExp = pangramia.yt.exceptions.ttypes.PBServiceException.read(iprot)
else:
iprot.skip(ftype)
elif fid == 2:
if ftype == TType.STRUCT:
self.userExp = pangramia.yt.exceptions.ttypes.PBUserException.read(iprot)
else:
iprot.skip(ftype)
else:
iprot.skip(ftype)
iprot.readFieldEnd()
iprot.readStructEnd()
def write(self, oprot):
if oprot._fast_encode is not None and self.thrift_spec is not None:
oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
return
oprot.writeStructBegin('ping_result')
if self.success is not None:
oprot.writeFieldBegin('success', TType.BOOL, 0)
oprot.writeBool(self.success)
oprot.writeFieldEnd()
if self.serviceExp is not None:
oprot.writeFieldBegin('serviceExp', TType.STRUCT, 1)
self.serviceExp.write(oprot)
oprot.writeFieldEnd()
if self.userExp is not None:
oprot.writeFieldBegin('userExp', TType.STRUCT, 2)
self.userExp.write(oprot)
oprot.writeFieldEnd()
oprot.writeFieldStop()
oprot.writeStructEnd()
def validate(self):
return
def __repr__(self):
L = ['%s=%r' % (key, value)
for key, value in self.__dict__.items()]
return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
def __eq__(self, other):
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
def __ne__(self, other):
return not (self == other)
all_structs.append(ping_result)
ping_result.thrift_spec = (
(0, TType.BOOL, 'success', None, None, ), # 0
(1, TType.STRUCT, 'serviceExp', [pangramia.yt.exceptions.ttypes.PBServiceException, None], None, ), # 1
(2, TType.STRUCT, 'userExp', [pangramia.yt.exceptions.ttypes.PBUserException, None], None, ), # 2
)
class reportError_args(object):
"""
Attributes:
- message
- details
"""
def __init__(self, message=None, details=None,):
self.message = message
self.details = details
def read(self, iprot):
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec])
return
iprot.readStructBegin()
while True:
(fname, ftype, fid) = iprot.readFieldBegin()
if ftype == TType.STOP:
break
if fid == 1:
if ftype == TType.STRING:
self.message = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
else:
iprot.skip(ftype)
elif fid == 2:
if ftype == TType.MAP:
self.details = {}
(_ktype1, _vtype2, _size0) = iprot.readMapBegin()
for _i4 in range(_size0):
_key5 = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
_val6 = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
self.details[_key5] = _val6
iprot.readMapEnd()
else:
iprot.skip(ftype)
else:
iprot.skip(ftype)
iprot.readFieldEnd()
iprot.readStructEnd()
def write(self, oprot):
if oprot._fast_encode is not None and self.thrift_spec is not None:
oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
return
oprot.writeStructBegin('reportError_args')
if self.message is not None:
oprot.writeFieldBegin('message', TType.STRING, 1)
oprot.writeString(self.message.encode('utf-8') if sys.version_info[0] == 2 else self.message)
oprot.writeFieldEnd()
if self.details is not None:
oprot.writeFieldBegin('details', TType.MAP, 2)
oprot.writeMapBegin(TType.STRING, TType.STRING, len(self.details))
for kiter7, viter8 in self.details.items():
oprot.writeString(kiter7.encode('utf-8') if sys.version_info[0] == 2 else kiter7)
oprot.writeString(viter8.encode('utf-8') if sys.version_info[0] == 2 else viter8)
oprot.writeMapEnd()
oprot.writeFieldEnd()
oprot.writeFieldStop()
oprot.writeStructEnd()
def validate(self):
return
def __repr__(self):
L = ['%s=%r' % (key, value)
for key, value in self.__dict__.items()]
return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
def __eq__(self, other):
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
def __ne__(self, other):
return not (self == other)
all_structs.append(reportError_args)
reportError_args.thrift_spec = (
None, # 0
(1, TType.STRING, 'message', 'UTF8', None, ), # 1
(2, TType.MAP, 'details', (TType.STRING, 'UTF8', TType.STRING, 'UTF8', False), None, ), # 2
)
class reportError_result(object):
    """
    Attributes:
     - success
     - serviceExp
     - userExp
    """

    def __init__(self, success=None, serviceExp=None, userExp=None,):
        self.success = success
        self.serviceExp = serviceExp
        self.userExp = userExp

    def read(self, iprot):
        if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
            iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec])
            return
        iprot.readStructBegin()
        while True:
            (fname, ftype, fid) = iprot.readFieldBegin()
            if ftype == TType.STOP:
                break
            if fid == 0:
                if ftype == TType.BOOL:
                    self.success = iprot.readBool()
                else:
                    iprot.skip(ftype)
            elif fid == 1:
                if ftype == TType.STRUCT:
                    # Instantiate the exception struct, then deserialize into it
                    # (read() is an instance method and cannot be called on the class).
                    self.serviceExp = pangramia.yt.exceptions.ttypes.PBServiceException()
                    self.serviceExp.read(iprot)
                else:
                    iprot.skip(ftype)
            elif fid == 2:
                if ftype == TType.STRUCT:
                    self.userExp = pangramia.yt.exceptions.ttypes.PBUserException()
                    self.userExp.read(iprot)
                else:
                    iprot.skip(ftype)
            else:
                iprot.skip(ftype)
            iprot.readFieldEnd()
        iprot.readStructEnd()

    def write(self, oprot):
        if oprot._fast_encode is not None and self.thrift_spec is not None:
            oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
            return
        oprot.writeStructBegin('reportError_result')
        if self.success is not None:
            oprot.writeFieldBegin('success', TType.BOOL, 0)
            oprot.writeBool(self.success)
            oprot.writeFieldEnd()
        if self.serviceExp is not None:
            oprot.writeFieldBegin('serviceExp', TType.STRUCT, 1)
            self.serviceExp.write(oprot)
            oprot.writeFieldEnd()
        if self.userExp is not None:
            oprot.writeFieldBegin('userExp', TType.STRUCT, 2)
            self.userExp.write(oprot)
            oprot.writeFieldEnd()
        oprot.writeFieldStop()
        oprot.writeStructEnd()

    def validate(self):
        return

    def __repr__(self):
        L = ['%s=%r' % (key, value)
             for key, value in self.__dict__.items()]
        return '%s(%s)' % (self.__class__.__name__, ', '.join(L))

    def __eq__(self, other):
        return isinstance(other, self.__class__) and self.__dict__ == other.__dict__

    def __ne__(self, other):
        return not (self == other)
all_structs.append(reportError_result)
reportError_result.thrift_spec = (
    (0, TType.BOOL, 'success', None, None, ),  # 0
    (1, TType.STRUCT, 'serviceExp', [pangramia.yt.exceptions.ttypes.PBServiceException, None], None, ),  # 1
    (2, TType.STRUCT, 'userExp', [pangramia.yt.exceptions.ttypes.PBUserException, None], None, ),  # 2
)
class shutdown_args(object):

    def read(self, iprot):
        if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
            iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec])
            return
        iprot.readStructBegin()
        while True:
            (fname, ftype, fid) = iprot.readFieldBegin()
            if ftype == TType.STOP:
                break
            else:
                iprot.skip(ftype)
            iprot.readFieldEnd()
        iprot.readStructEnd()

    def write(self, oprot):
        if oprot._fast_encode is not None and self.thrift_spec is not None:
            oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
            return
        oprot.writeStructBegin('shutdown_args')
        oprot.writeFieldStop()
        oprot.writeStructEnd()

    def validate(self):
        return

    def __repr__(self):
        L = ['%s=%r' % (key, value)
             for key, value in self.__dict__.items()]
        return '%s(%s)' % (self.__class__.__name__, ', '.join(L))

    def __eq__(self, other):
        return isinstance(other, self.__class__) and self.__dict__ == other.__dict__

    def __ne__(self, other):
        return not (self == other)
all_structs.append(shutdown_args)
shutdown_args.thrift_spec = (
)
fix_spec(all_structs)
del all_structs
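
The args/result classes above are the wire-level containers that the generated service stubs use for ping, reportError, and shutdown. The following is a minimal, hypothetical usage sketch and is not part of the generated file: it assumes the generated client lives at pangramia.base_service.BaseService (the package listed in the __init__ below) and that a server is listening on localhost:9090, both of which are placeholders.

# Hedged sketch only: module path, host, and port are assumptions, not taken from this diff.
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from pangramia.base_service import BaseService  # assumed location of the generated client

transport = TTransport.TBufferedTransport(TSocket.TSocket('localhost', 9090))
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = BaseService.Client(protocol)

transport.open()
try:
    client.ping()                          # serialized via ping_args / ping_result
    client.reportError('example failure',  # serialized via reportError_args / reportError_result
                       {'component': 'worker'})
finally:
    transport.close()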

View File

@@ -0,0 +1 @@
__all__ = ['ttypes', 'constants', 'BaseService']

View File

@@ -0,0 +1,14 @@
#
# Autogenerated by Thrift Compiler (0.20.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#
from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
from thrift.protocol.TProtocol import TProtocolException
from thrift.TRecursive import fix_spec
import sys
from .ttypes import *

View File

@@ -0,0 +1,20 @@
#
# Autogenerated by Thrift Compiler (0.20.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#
from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
from thrift.protocol.TProtocol import TProtocolException
from thrift.TRecursive import fix_spec
import sys
import pangramia.yt.common.ttypes
import pangramia.yt.exceptions.ttypes
from thrift.transport import TTransport
all_structs = []
fix_spec(all_structs)
del all_structs

File diff suppressed because it is too large.

View File

@@ -0,0 +1 @@
__all__ = ['ttypes', 'constants', 'YTAccountsOpService']

View File

@@ -0,0 +1,14 @@
#
# Autogenerated by Thrift Compiler (0.20.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#
from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
from thrift.protocol.TProtocol import TProtocolException
from thrift.TRecursive import fix_spec
import sys
from .ttypes import *

View File

@@ -0,0 +1,21 @@
#
# Autogenerated by Thrift Compiler (0.20.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#
from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
from thrift.protocol.TProtocol import TProtocolException
from thrift.TRecursive import fix_spec
import sys
import pangramia.yt.common.ttypes
import pangramia.yt.exceptions.ttypes
import pangramia.base_service.ttypes
from thrift.transport import TTransport
all_structs = []
fix_spec(all_structs)
del all_structs

Some files were not shown because too many files have changed in this diff.