# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
#
# WARNING: This configuration is for local development. Do not use it in a production deployment.
#
# This configuration supports basic settings through environment variables or an .env file.
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME           - Docker image name used to run Airflow.
#                                Default: pangramia/ytdlp-ops-airflow:latest
# AIRFLOW_UID                  - User ID in Airflow containers.
#                                Default: 50000
# AIRFLOW_PROJ_DIR             - Base path for the directories mounted into the containers.
#                                Default: .
# These settings are mostly useful for standalone testing or trying Airflow out.
#
# _AIRFLOW_WWW_USER_USERNAME   - Username for the administrator account (if requested).
#                                Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD   - Password for the administrator account (if requested).
#                                Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
#                                Use this option ONLY for quick checks. Installing requirements at container
#                                startup is done EVERY TIME the service is started.
#                                A better way is to build a custom image or extend the official image
#                                as described in https://airflow.apache.org/docs/docker-stack/build.html.
#                                Default: ''
#
# Feel free to modify this file to suit your needs.
---
name: airflow-master

x-minio-common: &minio-common
  image: quay.io/minio/minio:RELEASE.2025-07-23T15-54-02Z
  command: server --console-address ":9001" http://minio{1...3}/data{1...2}
  expose:
    - "9000"
    - "9001"
  networks:
    - proxynet
  env_file:
    - .env
  environment:
    MINIO_ROOT_USER: ${{ '{' }}MINIO_ROOT_USER:-admin{{ '}' }}
    MINIO_ROOT_PASSWORD: ${{ '{' }}MINIO_ROOT_PASSWORD:-0153093693-0009{{ '}' }}
  healthcheck:
    test: ["CMD", "mc", "ready", "local"]
    interval: 5s
    timeout: 5s
    retries: 5
  restart: always

x-airflow-common: &airflow-common
  # In order to add custom dependencies or upgrade provider packages you can use your extended image.
  # This will build the image from the Dockerfile in this directory and tag it.
  image: ${{ '{' }}AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest{{ '}' }}
  build: .
  # Add extra hosts here to allow the master services (webserver, scheduler) to resolve
  # the hostnames of your remote DL workers. This is crucial for fetching logs.
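  # For example, with a (hypothetical) cluster.yml entry mapping a worker "dl-worker-1" to 10.0.0.11,
  # the loop below renders the entry:
  #   - "dl-worker-1:10.0.0.11"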
  # Format: - "hostname:ip_address"
  # IMPORTANT: This section is auto-generated from cluster.yml
  extra_hosts:
    {% for host_name, host_ip in all_hosts.items() %}
    - "{{ host_name }}:{{ host_ip }}"
    {% endfor %}
  env_file:
    - .env
  networks:
    - proxynet
  environment:
    &airflow-common-env
    AIRFLOW__CORE__PARALLELISM: 64
    AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG: 32
    AIRFLOW__SCHEDULER__PARSING_PROCESSES: 4
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${{ '{' }}POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP{{ '}' }}@postgres/airflow
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:${{ '{' }}POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP{{ '}' }}@postgres/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }}@redis:6379/0
    AIRFLOW__CORE__FERNET_KEY: ''
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
    AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
    AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
    # The following line points Airflow at a custom config file stored in the local config folder.
    # Replace airflow.cfg with the name of your config file if you want to use a different one.
    AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
    AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
    AIRFLOW__CORE__INTERNAL_API_SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
    # yamllint disable rule:line-length
    # Use simple http server on scheduler for health checks
    # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
    # yamllint enable rule:line-length
    AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
    # WARNING: Use the _PIP_ADDITIONAL_REQUIREMENTS option ONLY for quick checks.
    # For any other purpose (development, test and especially production usage) build or extend the Airflow image.
    #_PIP_ADDITIONAL_REQUIREMENTS: ${{ '{' }}_PIP_ADDITIONAL_REQUIREMENTS:- apache-airflow-providers-docker apache-airflow-providers-http thrift>=0.16.0,<=0.20.0 backoff>=2.2.1 python-dotenv==1.0.1 psutil>=5.9.0{{ '}' }}
    AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
    AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
    AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
    AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
    {% raw %}
    AIRFLOW__LOGGING__REMOTE_LOG_FORMAT: "[%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s"
    AIRFLOW__LOGGING__LOG_LEVEL: "INFO"
    AIRFLOW__LOGGING__LOG_FILENAME_TEMPLATE: "{{ ti.dag_id }}/{{ ti.run_id }}/{{ ti.task_id }}/attempt={{ try_number }}.log"
    {% endraw %}
    AIRFLOW__CORE__LOCAL_SETTINGS_PATH: "/opt/airflow/config/custom_task_hooks.py"
  volumes:
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/dags:/opt/airflow/dags
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/logs:/opt/airflow/logs
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/config:/opt/airflow/config
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/plugins:/opt/airflow/plugins
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/downloadfiles:/opt/airflow/downloadfiles
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/addfiles:/opt/airflow/addfiles
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/inputfiles:/opt/airflow/inputfiles
  user: "${{ '{' }}AIRFLOW_UID:-50000{{ '}' }}:${{ '{' }}AIRFLOW_GID:-0{{ '}' }}"
  depends_on:
    &airflow-common-depends-on
    redis:
      condition: service_healthy
    postgres:
      condition: service_healthy
    nginx-minio-lb:
      condition: service_healthy

services:
  postgres:
    image: postgres:13
    env_file:
      - .env
    networks:
      - proxynet
    environment:
      POSTGRES_USER: airflow
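      # The fallback value below is a development-only default; set POSTGRES_PASSWORD in .env to override it.
      # The Airflow connection strings in x-airflow-common use the same variable, so both stay in sync.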
      POSTGRES_PASSWORD: ${{ '{' }}POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP{{ '}' }}
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    ports:
      - "{{ postgres_port }}:5432"
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 10s
      retries: 5
      start_period: 5s
    restart: always

  redis:
    # Redis is limited to 7.2-bookworm due to licensing change
    # https://redis.io/blog/redis-adopts-dual-source-available-licensing/
    image: redis:7.2-bookworm
    env_file:
      - .env
    networks:
      - proxynet
    command:
      - "redis-server"
      - "--requirepass"
      - "${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }}"
      - "--bind"
      - "*"
      - "--protected-mode"
      - "no"
      - "--save"
      - "60"
      - "1"
      - "--loglevel"
      - "warning"
      - "--appendonly"
      - "yes"
    volumes:
      - ./redis-data:/data
    expose:
      - 6379
    ports:
      - "{{ redis_port }}:6379"
    healthcheck:
      test: ["CMD", "redis-cli", "-a", "${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }}", "ping"]
      interval: 10s
      timeout: 30s
      retries: 50
      start_period: 30s
    restart: always

  redis-proxy-account-clear:
    image: redis:7.2-bookworm
    container_name: redis-proxy-account-clear
    env_file:
      - .env
    networks:
      - proxynet
    command: >
      sh -c "
        echo 'Clearing proxy and account statuses from Redis...';
        redis-cli -h redis -a $${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }} --scan --pattern 'proxy_status:*' | xargs -r redis-cli -h redis -a $${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }} DEL;
        redis-cli -h redis -a $${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }} --scan --pattern 'account_status:*' | xargs -r redis-cli -h redis -a $${{ '{' }}REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT{{ '}' }} DEL;
        echo 'Redis cleanup complete.'
      "
    depends_on:
      redis:
        condition: service_healthy

  minio1:
    <<: *minio-common
    hostname: minio1
    volumes:
      - ./minio-data/1/1:/data1
      - ./minio-data/1/2:/data2

  minio2:
    <<: *minio-common
    hostname: minio2
    volumes:
      - ./minio-data/2/1:/data1
      - ./minio-data/2/2:/data2
    depends_on:
      minio1:
        condition: service_started

  minio3:
    <<: *minio-common
    hostname: minio3
    volumes:
      - ./minio-data/3/1:/data1
      - ./minio-data/3/2:/data2
    depends_on:
      minio2:
        condition: service_started

  nginx-minio-lb:
    image: nginx:1.19.2-alpine
    hostname: nginx-minio-lb
    networks:
      - proxynet
    command: sh -c "apk add --no-cache curl >/dev/null 2>&1 && exec nginx -g 'daemon off;'"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    ports:
      - "9000:9000"
      - "9001:9001"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9001/minio/health/live"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 10s
    depends_on:
      minio1:
        condition: service_healthy
      minio2:
        condition: service_healthy
      minio3:
        condition: service_healthy
    restart: always

  minio-init:
    image: minio/mc
    container_name: minio-init
    networks:
      - proxynet
    depends_on:
      nginx-minio-lb:
        condition: service_healthy
    entrypoint: >
      /bin/sh -c "
        set -e;
        /usr/bin/mc alias set minio http://nginx-minio-lb:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD;

        # Retry loop for bucket creation
        MAX_ATTEMPTS=10
        SUCCESS=false

        # Use a for loop for robustness, as it's generally more portable than `until`.
        for i in $$(seq 1 $$MAX_ATTEMPTS); do
          # Check if the bucket exists. If so, we're done.
          if /usr/bin/mc ls minio/airflow-logs > /dev/null 2>&1; then
            echo 'MinIO bucket already exists.'
            SUCCESS=true
            break
          fi

          # If not, try to create it. If successful, we're done.
          # We redirect output because `mc mb` can error if another process creates it in the meantime.
          if /usr/bin/mc mb minio/airflow-logs > /dev/null 2>&1; then
            echo 'MinIO bucket created.'
            SUCCESS=true
            break
          fi

          # If we reach here, both checks failed. Wait and retry.
          echo \"Attempt $$i/$$MAX_ATTEMPTS: Waiting for MinIO bucket...\"
          sleep 2
        done

        # After the loop, check if we succeeded.
        if [ \"$$SUCCESS\" = \"false\" ]; then
          echo \"Failed to create MinIO bucket after $$MAX_ATTEMPTS attempts.\"
          exit 1
        fi

        /usr/bin/mc anonymous set download minio/airflow-logs;
        echo 'MinIO initialized: bucket airflow-logs created and policy set to download.';
      "
    env_file:
      - .env
    environment:
      MINIO_ROOT_USER: ${{ '{' }}MINIO_ROOT_USER:-admin{{ '}' }}
      MINIO_ROOT_PASSWORD: ${{ '{' }}MINIO_ROOT_PASSWORD:-0153093693-0009{{ '}' }}
    restart: on-failure

  nginx-healthcheck:
    image: nginx:alpine
    container_name: nginx-healthcheck
    networks:
      - proxynet
    ports:
      - "8888:80"
    restart: always

  airflow-webserver:
    <<: *airflow-common
    command: webserver
    ports:
      - "8080:8080"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-master-worker:
    <<: *airflow-common
    command: airflow celery worker -q main,default
    healthcheck:
      # yamllint disable rule:line-length
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-master@$$(hostname)"'
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    environment:
      <<: *airflow-common-env
      # Required to handle warm shutdown of the celery workers properly
      # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
      DUMB_INIT_SETSID: 0
      AIRFLOW__CELERY__WORKER_QUEUES: "main,default"
      AIRFLOW__CELERY__WORKER_TAGS: "master"
      AIRFLOW__CELERY__WORKER_CONCURRENCY: "16"
      AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
      AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
      AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
      AIRFLOW__CELERY__WORKER_NAME: "worker-master@%h"
      AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
      # Max memory (in KiB) per child process before it is recycled. Helps prevent memory leaks.
      # 256MB is sufficient for master worker tasks. DL workers use a higher limit.
      AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "262144"  # 256MB
    hostname: ${{ '{' }}HOSTNAME{{ '}' }}
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-triggerer:
    <<: *airflow-common
    command: triggerer
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${{ '{' }}HOSTNAME{{ '}' }}"']
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-init:
    <<: *airflow-common
    depends_on:
      <<: *airflow-common-depends-on
      minio-init:
        condition: service_completed_successfully
      redis-proxy-account-clear:
        condition: service_completed_successfully
    entrypoint: /bin/bash
    # yamllint disable rule:line-length
    command:
      - -c
      - |
        # This container runs as root and is responsible for initializing the environment.
        # It sets permissions on mounted directories to ensure the 'airflow' user (running with AIRFLOW_UID)
        # can write to them. This is crucial for logs, dags, and plugins.
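        # Steps below, in order: fix ownership of the mounted directories, wait for the database,
        # run migrations, create the admin user, then import connection definitions from /opt/airflow/config.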
        echo "Initializing permissions for Airflow directories..."
        chown -R "${{ '{' }}AIRFLOW_UID{{ '}' }}:${{ '{' }}AIRFLOW_GID{{ '}' }}" /opt/airflow/dags /opt/airflow/logs /opt/airflow/plugins /opt/airflow/config /opt/airflow/downloadfiles /opt/airflow/addfiles /opt/airflow/inputfiles
        echo "Permissions set."

        if [[ -z "${{ '{' }}AIRFLOW_UID{{ '}' }}" ]]; then
          echo
          echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
          echo "If you are on Linux, you SHOULD follow the instructions below to set "
          echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
          echo "For other operating systems you can get rid of the warning with a manually created .env file:"
          echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
          echo
        fi

        # This container's job is to initialize the database, create a user, and import connections.

        # Wait for the db to be ready.
        airflow db check --retry 30 --retry-delay 5

        # Run database migrations.
        echo "Running database migrations..."
        airflow db upgrade
        echo "Database migrations complete."

        # Create the admin user if it doesn't exist.
        # The '|| true' prevents the script from failing if the user already exists.
        echo "Checking for and creating admin user..."
        airflow users create \
          --username "admin" \
          --password "${{ '{' }}AIRFLOW_ADMIN_PASSWORD:-admin_pwd_X9yZ3aB1cE5dF7gH{{ '}' }}" \
          --firstname Admin \
          --lastname User \
          --role Admin \
          --email admin@example.com || true
        echo "Admin user check/creation complete."

        # Import connections from any .json file in the config directory.
        echo "Searching for connection files in /opt/airflow/config..."
        if [ -d "/opt/airflow/config" ] && [ -n "$$(ls -A /opt/airflow/config/*.json 2>/dev/null)" ]; then
          for conn_file in /opt/airflow/config/*.json; do
            if [ -f "$$conn_file" ]; then
              # Exclude files that are not meant to be Airflow connections.
              if [ "$$(basename "$$conn_file")" = "camoufox_endpoints.json" ]; then
                echo "Skipping '$$conn_file' as it is not an Airflow connection file."
                continue
              fi
              echo "Importing connections from $$conn_file"
              airflow connections import "$$conn_file" || echo "Failed to import $$conn_file, but continuing."
            fi
          done
        else
          echo "No connection files found to import, or /opt/airflow/config is empty/missing."
        fi
        echo "Connection import process complete."
    # yamllint enable rule:line-length
    environment:
      <<: *airflow-common-env
      _AIRFLOW_DB_MIGRATE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'false'  # Set to false as we handle it manually
      _PIP_ADDITIONAL_REQUIREMENTS: ''
    user: "0:0"

  airflow-cli:
    <<: *airflow-common
    profiles:
      - debug
    environment:
      <<: *airflow-common-env
      CONNECTION_CHECK_MAX_COUNT: "0"
    # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
    command:
      - bash
      - -c
      - airflow

  # You can enable flower by adding the "--profile flower" option, e.g. docker-compose --profile flower up,
  # or by explicitly targeting it on the command line, e.g. docker-compose up flower.
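  # NOTE: in this file the flower service below has no "profiles:" entry, so it also starts with a plain
  # "docker-compose up"; the --profile flag only matters if you gate the service with "profiles: [flower]".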
  # See: https://docs.docker.com/compose/profiles/
  flower:
    <<: *airflow-common
    command: celery flower
    ports:
      - "5555:5555"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  docker-socket-proxy:
    profiles:
      - disabled
    image: tecnativa/docker-socket-proxy:0.1.1
    networks:
      - proxynet
    environment:
      CONTAINERS: 1
      IMAGES: 1
      AUTH: 1
      POST: 1
    privileged: true
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    restart: always

volumes:
  postgres-db-volume:

networks:
  proxynet:
    name: airflow_proxynet
    external: true
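# NOTE: "external: true" means this file does not create the airflow_proxynet network.
# Create it once on the host before the first run, for example:
#   docker network create airflow_proxynet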