# Airflow remote DL worker configuration.
# This file should be used on a remote machine to run a download worker.
# It requires a master Airflow instance running with its services exposed.
#
# Before running, create a .env file in this directory with:
#   MASTER_HOST_IP=a.b.c.d    # IP address of the machine running docker-compose-master.yaml
#   POSTGRES_PASSWORD=...     # The password for the PostgreSQL database from the master compose file
#   REDIS_PASSWORD=...        # The password for Redis from the master compose file
#   AIRFLOW_UID=...           # User ID for file permissions; should match the master
---
x-airflow-common: &airflow-common
  # This should point to the same image used by the master.
  # If you built a custom image for the master, you need to push it to a registry
  # and reference it here.
  image: ${AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest}
  # Add extra hosts here to allow workers to resolve other hosts by name.
  # This section is auto-generated by Ansible from the inventory.
  extra_hosts:
{% for host in groups['all'] %}
    - "{{ hostvars[host]['inventory_hostname'] }}:{{ hostvars[host]['ansible_host'] | default(hostvars[host]['inventory_hostname']) }}"
{% endfor %}
  env_file:
    # The .env file is located in the project root (e.g., /srv/airflow_dl_worker),
    # so we provide an absolute path to it.
    - "{{ airflow_worker_dir }}/.env"
  environment: &airflow-common-env
    AIRFLOW__CORE__PARALLELISM: 128
    AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG: 64
    AIRFLOW__SCHEDULER__PARSING_PROCESSES: 8
    AIRFLOW__WEBSERVER__WORKERS: 5
    AIRFLOW__WEBSERVER__WORKER_CLASS: "gevent"
    AIRFLOW__LOGGING__SECRET_MASK_EXCEPTION_ARGS: "False"
    # May help prevent a slow webserver when memory is low.
    GUNICORN_CMD_ARGS: --max-requests 20 --max-requests-jitter 3 --worker-tmp-dir /dev/shm

    # Airflow Core
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
    AIRFLOW__CORE__FERNET_KEY: ''  # Should be the same as on the master, but the worker does not need it.
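    # Rendered example (illustrative assumption, not auto-generated): with an
    # inventory where postgres_port=5432 and redis_port=6379, and a .env file
    # setting MASTER_HOST_IP=10.0.0.5, the backend settings below expand to roughly:
    #   AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:<POSTGRES_PASSWORD>@10.0.0.5:5432/airflow
    #   AIRFLOW__CELERY__BROKER_URL=redis://:<REDIS_PASSWORD>@10.0.0.5:6379/0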
    # Backend connections - These should point to the master node
    # Set MASTER_HOST_IP, POSTGRES_PASSWORD, and REDIS_PASSWORD in your .env file
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${{ '{' }}POSTGRES_PASSWORD{{ '}' }}@${{ '{' }}MASTER_HOST_IP{{ '}' }}:{{ postgres_port }}/airflow
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql+psycopg2://airflow:${{ '{' }}POSTGRES_PASSWORD{{ '}' }}@${{ '{' }}MASTER_HOST_IP{{ '}' }}:{{ postgres_port }}/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:${REDIS_PASSWORD}@${MASTER_HOST_IP}:{{ redis_port }}/0

    # Remote Logging - connection is configured directly via environment variables
    #_PIP_ADDITIONAL_REQUIREMENTS: ${{ '{' }}_PIP_ADDITIONAL_REQUIREMENTS:- apache-airflow-providers-docker apache-airflow-providers-http thrift>=0.16.0,<=0.20.0 backoff>=2.2.1 python-dotenv==1.0.1 psutil>=5.9.0 apache-airflow-providers-amazon{{ '}' }}
    AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
    AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
    AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
    AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
    #AIRFLOW__LOGGING__LOG_ID_TEMPLATE: "{dag_id}-{task_id}-{run_id}-{try_number}"
    AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
    AIRFLOW__CORE__INTERNAL_API_SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
    AIRFLOW__CORE__LOCAL_SETTINGS_PATH: "/opt/airflow/config/custom_task_hooks.py"
  volumes:
    # Mount dags to get any utility scripts, but the worker will pull the DAG from the DB
    - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
    # Mount logs locally in case remote logging fails
    - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
    # Mount config for local settings and other configurations
    - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
    - ${AIRFLOW_PROJ_DIR:-.}/config/airflow.cfg:/opt/airflow/airflow.cfg
    # Mount download directories
    - ${AIRFLOW_PROJ_DIR:-.}/downloadfiles:/opt/airflow/downloadfiles
    - ${AIRFLOW_PROJ_DIR:-.}/addfiles:/opt/airflow/addfiles
    - ${AIRFLOW_PROJ_DIR:-.}/inputfiles:/opt/airflow/inputfiles
    # Mount the generated pangramia package to ensure workers have the latest version
    - ${AIRFLOW_PROJ_DIR:-.}/pangramia:/app/pangramia
  # Use AIRFLOW_UID from the .env file to fix permission issues.
  # GID is set to 0 for compatibility with the Airflow image.
  user: "${{ '{' }}AIRFLOW_UID:-50000{{ '}' }}:0"

services:
  airflow-worker-dl:
    <<: *airflow-common
    container_name: airflow-worker-dl-1
    hostname: ${HOSTNAME:-dl001}
    # The DL worker listens on the generic dl queue AND its own dedicated queue.
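    # As an illustration (task and queue names here are examples, not taken from the
    # master's DAGs), a task can be pinned to this specific host by targeting the
    # dedicated queue in the DAG definition:
    #   BashOperator(task_id="fetch_video", bash_command="...", queue="queue-dl-dl001")
    # while queue="queue-dl" lets any available download worker pick the task up.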
    command: airflow celery worker -q queue-dl,queue-dl-${HOSTNAME:-dl001}
    deploy:
      resources:
        limits:
          memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT:-8G}
        reservations:
          memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_RESERV:-2G}
    healthcheck:
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-dl@$$(hostname)"'
      interval: 30s
      timeout: 30s
      retries: 5
      start_period: 30s
    environment:
      <<: *airflow-common-env
      HOSTNAME: ${HOSTNAME:-dl001}
      DUMB_INIT_SETSID: "0"
      AIRFLOW__CELERY__WORKER_QUEUES: "queue-dl,queue-dl-${HOSTNAME:-dl001}"
      AIRFLOW__CELERY__WORKER_TAGS: "dl"
      AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
      AIRFLOW__CELERY__WORKER_AUTOSCALE: "16,8"
      AIRFLOW__CELERY__POOL: "prefork"
      AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
      AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
      AIRFLOW__CELERY__WORKER_NAME: "worker-dl@%h"
      AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
      AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "524288"  # 512 MB (value is in KiB)
    ports:
      - "8793:8793"
    networks:
      - default
      - proxynet
    restart: always

  airflow-worker-auth:
    <<: *airflow-common
    container_name: airflow-worker-auth-1
    hostname: ${HOSTNAME:-auth001}
    # The Auth worker listens on the generic auth queue AND its own dedicated queue.
    command: airflow celery worker -q queue-auth,queue-auth-${HOSTNAME:-auth001}
    deploy:
      resources:
        limits:
          memory: ${AIRFLOW_WORKER_AUTH_MEM_LIMIT:-4G}
        reservations:
          memory: ${AIRFLOW_WORKER_AUTH_MEM_RESERV:-1G}
    healthcheck:
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-auth@$$(hostname)"'
      interval: 30s
      timeout: 30s
      retries: 5
      start_period: 30s
    environment:
      <<: *airflow-common-env
      HOSTNAME: ${HOSTNAME:-auth001}
      DUMB_INIT_SETSID: "0"
      AIRFLOW__CELERY__WORKER_QUEUES: "queue-auth,queue-auth-${HOSTNAME:-auth001}"
      AIRFLOW__CELERY__WORKER_TAGS: "auth"
      AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
      # Auth tasks are less resource-intensive, but we run fewer of them to avoid overloading the auth service.
      AIRFLOW__CELERY__WORKER_AUTOSCALE: "2,1"
      AIRFLOW__CELERY__POOL: "prefork"
      AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
      AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
      AIRFLOW__CELERY__WORKER_NAME: "worker-auth@%h"
      AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
      AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "262144"  # 256 MB (value is in KiB)
    networks:
      - default
      - proxynet
    restart: always

  docker-socket-proxy:
    profiles:
      - disabled
    image: tecnativa/docker-socket-proxy:0.1.1
    environment:
      CONTAINERS: 1
      IMAGES: 1
      AUTH: 1
      POST: 1
    privileged: true
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    restart: always

networks:
  proxynet:
    name: airflow_proxynet
    external: true
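
# ---------------------------------------------------------------------------
# Usage sketch. The compose file name and the example values below are
# assumptions for illustration; adjust them to your deployment.
#
# Example .env placed next to this file:
#   MASTER_HOST_IP=10.0.0.5
#   POSTGRES_PASSWORD=change-me
#   REDIS_PASSWORD=change-me
#   AIRFLOW_UID=50000
#   HOSTNAME=dl001
#
# Start the download worker and confirm it answers over the Celery broker
# (the same ping used by the healthcheck above):
#   docker compose -f docker-compose-dl-worker.yaml up -d airflow-worker-dl
#   docker compose -f docker-compose-dl-worker.yaml exec airflow-worker-dl \
#     celery --app airflow.providers.celery.executors.celery_executor.app inspect ping
# ---------------------------------------------------------------------------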