# Airflow remote DL worker configuration.
# This file should be used on a remote machine to run a download worker.
# It requires a master Airflow instance running with its services exposed.
#
# Before running, create a .env file in this directory with:
#   MASTER_HOST_IP=a.b.c.d    # IP address of the machine running docker-compose-master.yaml
#   POSTGRES_PASSWORD=...     # The password for the PostgreSQL database from the master compose file
#   REDIS_PASSWORD=...        # The password for Redis from the master compose file
#   AIRFLOW_UID=...           # User ID for file permissions; should match the master
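#
# For example, a minimal .env with placeholder values (adjust to your deployment):
#   MASTER_HOST_IP=203.0.113.10
#   POSTGRES_PASSWORD=change-me
#   REDIS_PASSWORD=change-me
#   AIRFLOW_UID=50000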
---
x-airflow-common:
  &airflow-common
  # This should point to the same image used by the master.
  # If you built a custom image for the master, you need to push it to a registry
  # and reference it here.
  image: ${AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest}
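  # For example, a custom image could be published like this (hypothetical registry
  # and tag; the real values depend on your setup), then referenced via
  # AIRFLOW_IMAGE_NAME in .env:
  #   docker tag pangramia/ytdlp-ops-airflow:latest registry.example.com/ytdlp-ops-airflow:latest
  #   docker push registry.example.com/ytdlp-ops-airflow:latest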
  build: .
  # Add extra hosts here to allow workers to resolve other hosts by name.
  # This section is auto-generated by Ansible from the inventory.
  extra_hosts:
{% for host in groups['all'] %}
    - "{{ hostvars[host]['inventory_hostname'] }}:{{ hostvars[host]['ansible_host'] }}"
{% endfor %}
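  # With a hypothetical inventory, the loop above renders to entries such as:
  #   - "airflow-master:203.0.113.1"
  #   - "dl001:203.0.113.10"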
  env_file:
    # The .env file is located in the project root (e.g., /srv/airflow_dl_worker),
    # so we provide an absolute path to it.
    - "{{ airflow_worker_dir }}/.env"
  environment:
    &airflow-common-env
    AIRFLOW__CORE__PARALLELISM: 64
    AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG: 32
    AIRFLOW__SCHEDULER__PARSING_PROCESSES: 4
    AIRFLOW__WEBSERVER__WORKERS: 5
    AIRFLOW__WEBSERVER__WORKER_CLASS: "gevent"

    AIRFLOW__LOGGING__SECRET_MASK_EXCEPTION_ARGS: "False"

    # Keep the webserver responsive on low-memory hosts by recycling gunicorn workers.
    GUNICORN_CMD_ARGS: --max-requests 20 --max-requests-jitter 3 --worker-tmp-dir /dev/shm

    # Airflow Core
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
    AIRFLOW__CORE__FERNET_KEY: ''  # Should be the same as the master, but the worker does not need it.

    # Backend connections - these should point to the master node.
    # Set MASTER_HOST_IP, POSTGRES_PASSWORD, and REDIS_PASSWORD in your .env file.
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${{ '{' }}POSTGRES_PASSWORD{{ '}' }}@${{ '{' }}MASTER_HOST_IP{{ '}' }}:{{ postgres_port }}/airflow
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql+psycopg2://airflow:${{ '{' }}POSTGRES_PASSWORD{{ '}' }}@${{ '{' }}MASTER_HOST_IP{{ '}' }}:{{ postgres_port }}/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:${REDIS_PASSWORD}@${MASTER_HOST_IP}:52909/0
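    # For illustration, with MASTER_HOST_IP=203.0.113.1, postgres_port=5432, and
    # placeholder passwords, the rendered values would look like:
    #   AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:change-me@203.0.113.1:5432/airflow
    #   AIRFLOW__CELERY__BROKER_URL: redis://:change-me@203.0.113.1:52909/0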

    # Remote Logging - connection is configured directly via environment variables
    #_PIP_ADDITIONAL_REQUIREMENTS: ${{ '{' }}_PIP_ADDITIONAL_REQUIREMENTS:- apache-airflow-providers-docker apache-airflow-providers-http thrift>=0.16.0,<=0.20.0 backoff>=2.2.1 python-dotenv==1.0.1 psutil>=5.9.0 apache-airflow-providers-amazon{{ '}' }}
    AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
    AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
    AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
    AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
    #AIRFLOW__LOGGING__LOG_ID_TEMPLATE: "{dag_id}-{task_id}-{run_id}-{try_number}"
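    # The minio_default connection must exist on the worker for remote logging to work.
    # One option (an assumption, not configured in this file) is to provide it via
    # Airflow's AIRFLOW_CONN_<ID> environment-variable convention, e.g.:
    #   AIRFLOW_CONN_MINIO_DEFAULT: 'aws://<access_key>:<secret_key>@/?endpoint_url=http%3A%2F%2F<minio-host>%3A9000'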
    AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
    AIRFLOW__CORE__INTERNAL_API_SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
    AIRFLOW__CORE__LOCAL_SETTINGS_PATH: "/opt/airflow/config/custom_task_hooks.py"

  volumes:
    # Mount dags to get any utility scripts, but the worker will pull the DAG from the DB
    - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
    # Mount logs locally in case remote logging fails
    - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
    # Mount config for local settings and other configurations
    - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
    - ${AIRFLOW_PROJ_DIR:-.}/config/airflow.cfg:/opt/airflow/airflow.cfg
    # Mount download directories
    - ${AIRFLOW_PROJ_DIR:-.}/downloadfiles:/opt/airflow/downloadfiles
    - ${AIRFLOW_PROJ_DIR:-.}/addfiles:/opt/airflow/addfiles
    - ${AIRFLOW_PROJ_DIR:-.}/inputfiles:/opt/airflow/inputfiles
  # Use AIRFLOW_UID from the .env file to fix permission issues.
  user: "${AIRFLOW_UID:-50000}"

services:
  airflow-worker:
    <<: *airflow-common
    container_name: airflow-dl-worker-1
    hostname: ${HOSTNAME:-dl001}
    # The worker listens on the generic queue AND its own dedicated queue;
    # the hostname is dynamically inserted into the queue name.
    command: airflow celery worker -q queue-dl,queue-dl-${HOSTNAME:-dl001}
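    # Tasks are routed to this worker via the operator-level `queue` argument,
    # e.g. (hypothetical DAG code):
    #   BashOperator(task_id="download", bash_command="yt-dlp ...", queue="queue-dl-dl001")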
    deploy:
      resources:
        limits:
          # Increased from 4G to 8G to support higher memory per child process.
          memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT:-8G}
        reservations:
          memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_RESERV:-2G}
    healthcheck:
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-dl@$$(hostname)"'
      interval: 30s
      timeout: 30s
      retries: 5
      start_period: 30s
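    # The same check can be run manually against a running container, e.g.:
    #   docker compose exec airflow-worker celery \
    #     --app airflow.providers.celery.executors.celery_executor.app inspect ping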
    environment:
      <<: *airflow-common-env
      HOSTNAME: ${HOSTNAME:-dl001}  # Explicitly set inside the container
      DUMB_INIT_SETSID: "0"
      AIRFLOW__CELERY__WORKER_QUEUES: "queue-dl,queue-dl-${HOSTNAME:-dl001}"
      AIRFLOW__CELERY__WORKER_TAGS: "dl"
      AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
      AIRFLOW__CELERY__WORKER_CONCURRENCY: ${AIRFLOW_WORKER_DOWNLOAD_CONCURRENCY:-16}
      # Use the prefork pool for better compatibility with blocking libraries.
      AIRFLOW__CELERY__POOL: "prefork"
      AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
      AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
      AIRFLOW__CELERY__WORKER_NAME: "worker-dl@%h"
      AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
      # Increased from 256 MB to 512 MB for memory-intensive yt-dlp tasks.
      # This value is in KB: 512 * 1024 = 524288.
      AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "524288"  # 512 MB
    # The hostname is now managed by Docker Compose to ensure uniqueness when scaling.
    # It is generated from the project, service, and replica number (e.g., airflow-airflow-dl-worker-1).
    # hostname: "dl-worker-${HOSTNAME_SUFFIX:-$$(hostname)}"
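    # Note (assumption): scaling with `docker compose up -d --scale airflow-worker=N`
    # also requires removing the fixed container_name above, since container names
    # must be unique.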
    ports:
      - "8793:8793"
    networks:
      - default
      - proxynet
    restart: always

  docker-socket-proxy:
    profiles:
      - disabled
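    # The profile keeps this service out of normal startup; it only runs when the
    # profile is selected explicitly, e.g.:
    #   docker compose --profile disabled up -d docker-socket-proxy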
    image: tecnativa/docker-socket-proxy:0.1.1
    environment:
      CONTAINERS: 1
      IMAGES: 1
      AUTH: 1
      POST: 1
    privileged: true
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    restart: always

networks:
  proxynet:
    name: airflow_proxynet
    external: true
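    # This network is external, so Compose expects it to exist already.
    # If it does not, create it on the host first:
    #   docker network create airflow_proxynet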