# yt-dlp-dags/airflow/bak/docker-compose-dl.yaml
# Airflow remote DL worker configuration.
# This file should be used on a remote machine to run a download worker.
# It requires a master Airflow instance running with services exposed.
#
# Before running, create a .env file in this directory with:
# MASTER_HOST_IP=...      # IP address (a.b.c.d) of the machine running docker-compose-master.yaml
# POSTGRES_PASSWORD=...   # The password for the PostgreSQL database from the master compose file
# REDIS_PASSWORD=...      # The password for Redis from the master compose file
# AIRFLOW_UID=...         # User ID for file permissions; should match the master
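#
# Example .env (illustrative only; every value below is a placeholder):
#   MASTER_HOST_IP=192.0.2.10
#   POSTGRES_PASSWORD=change-me-postgres
#   REDIS_PASSWORD=change-me-redis
#   AIRFLOW_UID=50000
#   HOSTNAME=dl001   # optional; used for this worker's hostname and dedicated queue name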
---
x-airflow-common:
  &airflow-common
  # This should point to the same image used by the master.
  # If you built a custom image for the master, you need to push it to a
  # registry and reference it here.
  image: ${AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest}
  build: .
  # Add extra hosts here to allow workers to resolve other hosts by name.
  # This section is auto-generated from cluster.yml.
  extra_hosts:
{% for host_name, host_ip in all_hosts.items() %}
    - "{{ host_name }}:{{ host_ip }}"
{% endfor %}
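  # For illustration, with hypothetical hosts defined in cluster.yml the block above
  # renders to something like:
  #   extra_hosts:
  #     - "af-master:192.0.2.10"
  #     - "dl001:192.0.2.11"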
  env_file:
    - .env
  environment:
    &airflow-common-env
    # Airflow Core
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
    AIRFLOW__CORE__FERNET_KEY: ''  # Should be the same as on the master, but the worker does not need it.
    # Backend connections - these should point to the master node.
    # Set MASTER_HOST_IP, POSTGRES_PASSWORD, and REDIS_PASSWORD in your .env file.
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}@${MASTER_HOST_IP}:5432/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT}@${MASTER_HOST_IP}:52909/0
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}@${MASTER_HOST_IP}:5432/airflow
    # Remote logging - the connection is fetched from the DB, which lives on the master.
    AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
    AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
    AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
    AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
    AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
    AIRFLOW__CORE__INTERNAL_API_SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
    AIRFLOW__CORE__LOCAL_SETTINGS_PATH: "/opt/airflow/config/custom_task_hooks.py"
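    # The minio_default connection is expected to already exist in the master's metadata DB.
    # Illustrative sketch of creating it there (endpoint and credentials are placeholders):
    #   airflow connections add minio_default --conn-type aws \
    #     --conn-login <minio-access-key> --conn-password <minio-secret-key> \
    #     --conn-extra '{"endpoint_url": "http://<master-host>:9000"}'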
  volumes:
    # Mount dags to get any utility scripts, but the worker will pull the DAG from the DB
    - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
    # Mount logs locally in case remote logging fails
    - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
    # Mount config for local settings and other configurations
    - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
    # Mount download directories
    - ${AIRFLOW_PROJ_DIR:-.}/downloadfiles:/opt/airflow/downloadfiles
    - ${AIRFLOW_PROJ_DIR:-.}/addfiles:/opt/airflow/addfiles
    - ${AIRFLOW_PROJ_DIR:-.}/inputfiles:/opt/airflow/inputfiles
  # Use AIRFLOW_UID and AIRFLOW_GID from the .env file to fix permission issues.
  user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-0}"

services:
  airflow-worker:
    <<: *airflow-common
    container_name: airflow-dl-worker-1
    hostname: ${HOSTNAME:-dl001}
    # The worker now listens on the generic queue AND its own dedicated queue.
    # The hostname is dynamically inserted into the queue name.
    command: airflow celery worker -q queue-dl,queue-dl-${HOSTNAME:-dl001}
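    # To pin a task to this specific worker from a DAG, point it at the dedicated queue,
    # e.g. (hypothetical task definition):
    #   BashOperator(task_id="download", bash_command="yt-dlp ...", queue="queue-dl-dl001")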
    deploy:
      resources:
        limits:
          # Increased from 4G to 8G to support higher memory per child process.
          memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT:-8G}
        reservations:
          memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_RESERV:-2G}
    healthcheck:
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-dl@$$(hostname)"'
      interval: 30s
      timeout: 30s
      retries: 5
      start_period: 30s
    environment:
      <<: *airflow-common-env
      HOSTNAME: ${HOSTNAME:-dl001}  # Explicitly set inside the container
      DUMB_INIT_SETSID: "0"
      AIRFLOW__CELERY__WORKER_QUEUES: "queue-dl,queue-dl-${HOSTNAME:-dl001}"
      AIRFLOW__CELERY__WORKER_TAGS: "dl"
      AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
      AIRFLOW__CELERY__WORKER_CONCURRENCY: ${AIRFLOW_WORKER_DOWNLOAD_CONCURRENCY:-16}
      AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
      AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
      AIRFLOW__CELERY__WORKER_NAME: "worker-dl@%h"
      AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
      # Increased from 256MB to 512MB for memory-intensive yt-dlp tasks.
      # This value is in KB: 512 * 1024 = 524288.
      AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "524288"  # 512MB
    # The hostname is now managed by Docker Compose to ensure uniqueness when scaling.
    # It will be generated from the project, service, and replica number (e.g., airflow-airflow-dl-worker-1).
    # hostname: "dl-worker-${HOSTNAME_SUFFIX:-$$(hostname)}"
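    # Port 8793 exposes the Celery worker's log server, which the master webserver uses to
    # fetch task logs directly when they are not available via remote (S3/MinIO) logging.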
    ports:
      - "8793:8793"
    networks:
      - default
      - proxynet
    restart: always
  airflow-triggerer:
    <<: *airflow-common
    container_name: airflow-dl-triggerer-1
    hostname: ${HOSTNAME}
    command: triggerer
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
      interval: 30s
      timeout: 30s
      retries: 5
      start_period: 60s
    environment:
      <<: *airflow-common-env
      PYTHONASYNCIODEBUG: 1
      DUMB_INIT_SETSID: 0
    restart: always
  docker-socket-proxy:
    profiles:
      - disabled
    image: tecnativa/docker-socket-proxy:0.1.1
    environment:
      CONTAINERS: 1
      IMAGES: 1
      AUTH: 1
      POST: 1
    privileged: true
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    restart: always

networks:
  proxynet:
    name: airflow_proxynet
    external: true
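
# Example usage (illustrative; assumes the .env file described above sits next to this file):
#   docker compose -f docker-compose-dl.yaml up -d airflow-worker airflow-triggerer
#   docker compose -f docker-compose-dl.yaml logs -f airflow-worker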