# Airflow remote DL worker configuration.
# This file should be used on a remote machine to run a download worker.
# It requires a master Airflow instance running with its services exposed.
#
# Before running, create a .env file in this directory with:
#   MASTER_HOST_IP=...        # IP address (a.b.c.d) of the machine running docker-compose-master.yaml
#   POSTGRES_PASSWORD=...     # The password for the PostgreSQL database from the master compose file
#   REDIS_PASSWORD=...        # The password for Redis from the master compose file
#   AIRFLOW_UID=...           # User ID for file permissions; should match the master
# (A commented example .env and launch command are sketched at the end of this file.)
---
x-airflow-common: &airflow-common
  # This should point to the same image used by the master.
  # If you built a custom image for the master, you need to push it to a registry
  # and reference it here.
  image: ${AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest}
  build: .
  # Add extra hosts here to allow workers to resolve other hosts by name.
  # This section is auto-generated by Ansible from the inventory.
  extra_hosts:
{% for host in groups['all'] %}
    - "{{ hostvars[host]['inventory_hostname'] }}:{{ hostvars[host]['ansible_host'] }}"
{% endfor %}
  env_file:
    # The .env file is located in the project root (e.g., /srv/airflow_dl_worker),
    # so we provide an absolute path to it.
    - "{{ airflow_worker_dir }}/.env"
  environment: &airflow-common-env
    AIRFLOW__CORE__PARALLELISM: 64
    AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG: 32
    AIRFLOW__SCHEDULER__PARSING_PROCESSES: 4
    AIRFLOW__WEBSERVER__WORKERS: 5
    AIRFLOW__WEBSERVER__WORKER_CLASS: "gevent"
    AIRFLOW__LOGGING__SECRET_MASK_EXCEPTION_ARGS: "False"
    # Gunicorn tuning to keep the webserver responsive when memory is low.
    GUNICORN_CMD_ARGS: --max-requests 20 --max-requests-jitter 3 --worker-tmp-dir /dev/shm

    # Airflow Core
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
    AIRFLOW__CORE__FERNET_KEY: ''  # Should be the same as on the master, but the worker does not need it.
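
    # Note: this file is an Ansible/Jinja2 template. Placeholders in double curly
    # braces are filled in when Ansible renders it, while ${VAR} references are
    # substituted by Docker Compose from the .env file. As a purely hypothetical
    # illustration, MASTER_HOST_IP=10.0.0.5, POSTGRES_PASSWORD=s3cret and a
    # PostgreSQL port of 5432 would make the database connection below resolve to
    # postgresql+psycopg2://airflow:s3cret@10.0.0.5:5432/airflow.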

    # Backend connections - these should point to the master node.
    # Set MASTER_HOST_IP, POSTGRES_PASSWORD, and REDIS_PASSWORD in your .env file.
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${POSTGRES_PASSWORD}@${MASTER_HOST_IP}:{{ postgres_port }}/airflow
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql+psycopg2://airflow:${POSTGRES_PASSWORD}@${MASTER_HOST_IP}:{{ postgres_port }}/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:${REDIS_PASSWORD}@${MASTER_HOST_IP}:52909/0

    # Remote logging - the connection is configured directly via environment variables.
    #_PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- apache-airflow-providers-docker apache-airflow-providers-http thrift>=0.16.0,<=0.20.0 backoff>=2.2.1 python-dotenv==1.0.1 psutil>=5.9.0 apache-airflow-providers-amazon}
    AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
    AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
    AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
    AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
    #AIRFLOW__LOGGING__LOG_ID_TEMPLATE: "{dag_id}-{task_id}-{run_id}-{try_number}"

    AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
    AIRFLOW__CORE__INTERNAL_API_SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
    AIRFLOW__CORE__LOCAL_SETTINGS_PATH: "/opt/airflow/config/custom_task_hooks.py"
  volumes:
    # Mount dags to get any utility scripts, but the worker will pull the DAG from the DB.
    - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
    # Mount logs locally in case remote logging fails.
    - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
    # Mount config for local settings and other configuration files.
    - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
    - ${AIRFLOW_PROJ_DIR:-.}/config/airflow.cfg:/opt/airflow/airflow.cfg
    # Mount download directories.
    - ${AIRFLOW_PROJ_DIR:-.}/downloadfiles:/opt/airflow/downloadfiles
    - ${AIRFLOW_PROJ_DIR:-.}/addfiles:/opt/airflow/addfiles
    - ${AIRFLOW_PROJ_DIR:-.}/inputfiles:/opt/airflow/inputfiles
  # Use AIRFLOW_UID from the .env file to fix permission issues.
  user: "${AIRFLOW_UID:-50000}"

services:
  airflow-worker:
    <<: *airflow-common
    container_name: airflow-dl-worker-1
    hostname: ${HOSTNAME:-dl001}
    # The worker now listens on the generic queue AND its own dedicated queue.
    # The hostname is dynamically inserted into the queue name.
    command: airflow celery worker -q queue-dl,queue-dl-${HOSTNAME:-dl001}
    deploy:
      resources:
        limits:
          # Increased from 4G to 8G to support higher memory per child process.
          memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT:-8G}
        reservations:
          memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_RESERV:-2G}
    healthcheck:
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-dl@$$(hostname)"'
      interval: 30s
      timeout: 30s
      retries: 5
      start_period: 30s
    environment:
      <<: *airflow-common-env
      HOSTNAME: ${HOSTNAME:-dl001}  # Explicitly set inside the container
      DUMB_INIT_SETSID: "0"
      AIRFLOW__CELERY__WORKER_QUEUES: "queue-dl,queue-dl-${HOSTNAME:-dl001}"
      AIRFLOW__CELERY__WORKER_TAGS: "dl"
      AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
      AIRFLOW__CELERY__WORKER_CONCURRENCY: ${AIRFLOW_WORKER_DOWNLOAD_CONCURRENCY:-16}
      # Use the prefork pool for better compatibility with blocking libraries.
      AIRFLOW__CELERY__POOL: "prefork"
      AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
      AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
      AIRFLOW__CELERY__WORKER_NAME: "worker-dl@%h"
      AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
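      # Note: per Celery's documentation, worker_max_tasks_per_child and
      # worker_max_memory_per_child recycle a pool child process rather than kill
      # it mid-task; the memory limit is only checked after a task completes, so a
      # single oversized download can still exceed it temporarily.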
      # Increased from 256MB to 512MB for memory-intensive yt-dlp tasks.
      # This value is in KB: 512 * 1024 = 524288.
      AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "524288"  # 512MB
    # The hostname is now managed by Docker Compose to ensure uniqueness when scaling.
    # It is generated from the project, service, and replica number (e.g., airflow-airflow-dl-worker-1).
    # hostname: "dl-worker-${HOSTNAME_SUFFIX:-$$(hostname)}"
    ports:
      - "8793:8793"
    networks:
      - default
      - proxynet
    restart: always

  docker-socket-proxy:
    profiles:
      - disabled
    image: tecnativa/docker-socket-proxy:0.1.1
    environment:
      CONTAINERS: 1
      IMAGES: 1
      AUTH: 1
      POST: 1
    privileged: true
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    restart: always

networks:
  proxynet:
    name: airflow_proxynet
    external: true
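
# --- Example usage (illustrative only; the values below are hypothetical placeholders) ---
# A minimal .env next to the rendered compose file could look like:
#   MASTER_HOST_IP=10.0.0.5
#   POSTGRES_PASSWORD=change-me
#   REDIS_PASSWORD=change-me
#   AIRFLOW_UID=50000
# With that in place, the worker is typically started from the same directory with
# "docker compose up -d airflow-worker" (add -f <rendered-file>.yaml if the rendered
# file is not named compose.yaml or docker-compose.yaml) and inspected with
# "docker compose logs -f airflow-worker".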