# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
#
# WARNING: This configuration is for local development. Do not use it in a production deployment.
#
# This configuration supports basic customization via environment variables or an .env file.
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME           - Docker image name used to run Airflow.
#                                Default: pangramia/ytdlp-ops-airflow:latest
# AIRFLOW_UID                  - User ID in Airflow containers.
#                                Default: 50000
# AIRFLOW_PROJ_DIR             - Base path from which the project directories are mounted into the containers.
#                                Default: .
#
# The following variables are mostly useful when testing or trying out Airflow standalone:
#
# _AIRFLOW_WWW_USER_USERNAME   - Username for the administrator account (if requested).
#                                Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD   - Password for the administrator account (if requested).
#                                Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
#                                Use this option ONLY for quick checks. Installing requirements at container
#                                startup is done EVERY TIME the service is started.
#                                A better way is to build a custom image or extend the official image
#                                as described in https://airflow.apache.org/docs/docker-stack/build.html.
#                                Default: ''
#
# Feel free to modify this file to suit your needs.
---
name: airflow-master

x-airflow-common: &airflow-common
  # In order to add custom dependencies or upgrade provider packages you can use your extended image.
  # This will build the image from the Dockerfile in this directory and tag it.
  image: ${{ '{' }}AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest{{ '}' }}
  build: .
  # Add extra hosts here to allow the master services (webserver, scheduler) to resolve
  # the hostnames of your remote DL workers. This is crucial for fetching logs.
  # Format: - "hostname:ip_address"
  # This section is auto-generated by Ansible from the inventory.
  extra_hosts:
{% for host in groups['all'] %}
    - "{{ hostvars[host]['inventory_hostname'] }}:{{ hostvars[host]['ansible_host'] }}"
{% endfor %}
  env_file:
    # The .env file is located in the project root, one level above the 'configs' directory.
    - ".env"
  networks:
    - proxynet
  environment: &airflow-common-env
    AIRFLOW__CORE__PARALLELISM: 128
    AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG: 64
    AIRFLOW__SCHEDULER__PARSING_PROCESSES: 4
    AIRFLOW__WEBSERVER__WORKER_CLASS: sync
    AIRFLOW__WEBSERVER__WORKERS: 8
    AIRFLOW__LOGGING__SECRET_MASK_EXCEPTION_ARGS: 'false'
    # Keep gunicorn worker heartbeat files in /dev/shm to avoid a slow webserver on low-memory hosts.
    GUNICORN_CMD_ARGS: --worker-tmp-dir /dev/shm
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    # For master services, connect to Postgres and Redis using internal Docker service names.
    # Passwords are sourced from the .env file.
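    # A minimal sketch of the .env keys this file references (names taken from the
    # variables used below; the values shown are placeholders, not real defaults):
    #   POSTGRES_PASSWORD=...       # Postgres password, reused in the DB/result-backend URLs
    #   REDIS_PASSWORD=...          # Redis password, reused in the Celery broker URL
    #   MINIO_ROOT_USER=...         # MinIO credentials (minio / minio-init services)
    #   MINIO_ROOT_PASSWORD=...
    #   FLOWER_PASSWORD=...         # basic-auth password for the flower UI
    #   AIRFLOW_ADMIN_PASSWORD=...  # admin user created by airflow-init
    #   AIRFLOW_UID=50000           # host UID that should own mounted files
    #   HOSTNAME=...                # hostname used for the worker and triggerer containers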
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${{ '{' }}POSTGRES_PASSWORD{{ '}' }}@postgres:5432/airflow
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql+psycopg2://airflow:${{ '{' }}POSTGRES_PASSWORD{{ '}' }}@postgres:5432/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:${{ '{' }}REDIS_PASSWORD{{ '}' }}@redis:6379/0
    AIRFLOW__CORE__FERNET_KEY: ''
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
    AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
    AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
    AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
    AIRFLOW__WEBSERVER__WORKER_TIMEOUT: '120'
    AIRFLOW__CORE__INTERNAL_API_SECRET_KEY: 'qmALu5JCAW0518WGAqkVZZQ=='
    # yamllint disable rule:line-length
    # Use simple http server on scheduler for health checks
    # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
    # yamllint enable rule:line-length
    AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
    AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS: 'false'
    AIRFLOW__LOGGING__REMOTE_LOGGING: 'true'
    AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
    AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
    AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: 'false'
    AIRFLOW__CORE__LOCAL_SETTINGS_PATH: "/opt/airflow/config/custom_task_hooks.py"
  volumes:
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/dags:/opt/airflow/dags
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/logs:/opt/airflow/logs
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/config:/opt/airflow/config
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/config/airflow.cfg:/opt/airflow/airflow.cfg
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/plugins:/opt/airflow/plugins
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/downloadfiles:/opt/airflow/downloadfiles
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/addfiles:/opt/airflow/addfiles
    - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/inputfiles:/opt/airflow/inputfiles
  user: "${{ '{' }}AIRFLOW_UID:-50000{{ '}' }}:0"
  depends_on: &airflow-common-depends-on
    redis:
      condition: service_healthy
    postgres:
      condition: service_healthy
    minio-init:
      condition: service_completed_successfully

services:
  postgres:
    image: postgres:13
    env_file:
      - .env
    networks:
      - proxynet
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: ${{ '{' }}POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP{{ '}' }}
      POSTGRES_DB: airflow
    command:
      - "postgres"
      - "-c"
      - "shared_buffers=512MB"
      - "-c"
      - "effective_cache_size=1536MB"
    volumes:
      - ./postgres-data:/var/lib/postgresql/data
    ports:
      - "{{ postgres_port }}:5432"
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 10s
      retries: 5
      start_period: 5s
    restart: always

  redis:
    # Redis is limited to 7.2-bookworm due to licensing change
    # https://redis.io/blog/redis-adopts-dual-source-available-licensing/
    image: redis:7.2-bookworm
    env_file:
      - .env
    networks:
      - proxynet
    command:
      - "redis-server"
      - "--requirepass"
      - "${{ '{' }}REDIS_PASSWORD:-rOhTAIlTFFylXsjhqwxnYxDChFc{{ '}' }}"
      - "--bind"
      - "*"
      - "--protected-mode"
      - "no"
      - "--save"
      - "60"
      - "1"
      - "--loglevel"
      - "warning"
      - "--appendonly"
      - "yes"
    volumes:
      - redis-data:/data
    expose:
      - 6379
    ports:
      - "{{ redis_port }}:6379"
    healthcheck:
      test: ["CMD", "redis-cli", "-a", "${{ '{' }}REDIS_PASSWORD:-rOhTAIlTFFylXsjhqwxnYxDChFc{{ '}' }}", "ping"]
      interval: 10s
      timeout: 30s
      retries: 50
      start_period: 30s
    restart: always
    sysctls:
      - net.core.somaxconn=1024
    ulimits:
      memlock: -1

  redis-proxy-account-clear:
    image: redis:7.2-bookworm
    container_name: redis-proxy-account-clear
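    # One-shot helper: on every stack start it scans Redis for proxy_status:* and
    # account_status:* keys and deletes them (see the command below), presumably so
    # each run starts from a clean proxy/account state.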
    env_file:
      - .env
    networks:
      - proxynet
    command: >
      sh -c "
      echo 'Clearing proxy and account statuses from Redis...';
      redis-cli -h redis -a $${{ '{' }}REDIS_PASSWORD:-rOhTAIlTFFylXsjhqwxnYxDChFc{{ '}' }} --scan --pattern 'proxy_status:*' | xargs -r redis-cli -h redis -a $${{ '{' }}REDIS_PASSWORD:-rOhTAIlTFFylXsjhqwxnYxDChFc{{ '}' }} DEL;
      redis-cli -h redis -a $${{ '{' }}REDIS_PASSWORD:-rOhTAIlTFFylXsjhqwxnYxDChFc{{ '}' }} --scan --pattern 'account_status:*' | xargs -r redis-cli -h redis -a $${{ '{' }}REDIS_PASSWORD:-rOhTAIlTFFylXsjhqwxnYxDChFc{{ '}' }} DEL;
      echo 'Redis cleanup complete.'
      "
    depends_on:
      redis:
        condition: service_healthy

  minio:
    image: minio/minio:latest
    container_name: minio
    networks:
      - proxynet
    volumes:
      - ./minio-data:/data
    ports:
      - "9001:9000"
      - "9002:9001"
    environment:
      MINIO_ROOT_USER: ${{ '{' }}MINIO_ROOT_USER:-admin{{ '}' }}
      MINIO_ROOT_PASSWORD: ${{ '{' }}MINIO_ROOT_PASSWORD:-0153093693-0009{{ '}' }}
    command: server /data --console-address ":9001"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 20s
      retries: 3
    restart: always

  nginx-minio-lb:
    image: nginx:alpine
    container_name: nginx-minio-lb
    networks:
      - proxynet
    ports:
      - "9000:9000"
    volumes:
      - ./configs/nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      minio:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 10s
      retries: 5
    restart: always

  minio-init:
    image: minio/mc
    container_name: minio-init
    networks:
      - proxynet
    depends_on:
      nginx-minio-lb:
        condition: service_healthy
    entrypoint: /bin/sh
    command:
      - -c
      - |
        set -e
        /usr/bin/mc alias set minio http://nginx-minio-lb:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD
        # Retry loop for bucket creation
        MAX_ATTEMPTS=10
        SUCCESS=false
        # Use a for loop for robustness, as it's generally more portable than `until`.
        for i in $$(seq 1 $$MAX_ATTEMPTS); do
          # Check if the bucket exists. If so, we're done.
          if /usr/bin/mc ls minio/airflow-logs > /dev/null 2>&1; then
            echo 'MinIO bucket already exists.'
            SUCCESS=true
            break
          fi
          # If not, try to create it. If successful, we're done.
          # We redirect output because `mc mb` can error if another process creates it in the meantime.
          if /usr/bin/mc mb minio/airflow-logs > /dev/null 2>&1; then
            echo 'MinIO bucket created.'
            SUCCESS=true
            break
          fi
          # If we reach here, both checks failed. Wait and retry.
          echo "Attempt $$i/$$MAX_ATTEMPTS: Waiting for MinIO bucket..."
          sleep 2
        done
        # After the loop, check if we succeeded.
        if [ "$$SUCCESS" = "false" ]; then
          echo "Failed to create MinIO bucket after $$MAX_ATTEMPTS attempts."
          exit 1
        fi
        /usr/bin/mc anonymous set download minio/airflow-logs
        echo 'MinIO initialized: bucket airflow-logs created and policy set to download.'
    env_file:
      - .env
    environment:
      MINIO_ROOT_USER: ${{ '{' }}MINIO_ROOT_USER:-admin{{ '}' }}
      MINIO_ROOT_PASSWORD: ${{ '{' }}MINIO_ROOT_PASSWORD:-0153093693-0009{{ '}' }}
    restart: on-failure

  caddy:
    build:
      context: .
      dockerfile: Dockerfile.caddy
    image: pangramia/ytdlp-ops-caddy:latest
    container_name: caddy
    networks:
      - proxynet
    ports:
      - "8080:8080"
    depends_on:
      airflow-webserver:
        condition: service_started
    restart: always

  airflow-webserver:
    <<: *airflow-common
    command: webserver
    expose:
      - "8080"
    environment:
      <<: *airflow-common-env
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 30s
      timeout: 30s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-master-worker:
    <<: *airflow-common
    command: airflow celery worker -q main,default
    healthcheck:
      # yamllint disable rule:line-length
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-master@$$(hostname)"'
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    environment:
      <<: *airflow-common-env
      # Required to handle warm shutdown of the celery workers properly
      # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
      DUMB_INIT_SETSID: 0
      AIRFLOW__CELERY__WORKER_QUEUES: "main,default"
      AIRFLOW__CELERY__WORKER_TAGS: "master"
      AIRFLOW__CELERY__WORKER_CONCURRENCY: "16"
      AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
      AIRFLOW__CELERY__TASK_ACKS_LATE: "True"
      AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
      AIRFLOW__CELERY__WORKER_NAME: "worker-master@%h"
      AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
      # Max memory per child process (value in KiB) before it is recycled. Helps prevent memory leaks.
      # 256MB is sufficient for master worker tasks. DL workers use a higher limit.
      AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "262144"  # 256MB
    hostname: ${{ '{' }}HOSTNAME{{ '}' }}
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-triggerer:
    <<: *airflow-common
    command: triggerer
    hostname: ${{ '{' }}HOSTNAME{{ '}' }}
    environment:
      <<: *airflow-common-env
      PYTHONASYNCIODEBUG: "1"
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${{ '{' }}HOSTNAME{{ '}' }}"']
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-init:
    <<: *airflow-common
    depends_on:
      <<: *airflow-common-depends-on
      redis-proxy-account-clear:
        condition: service_completed_successfully
    entrypoint: /bin/bash
    # yamllint disable rule:line-length
    command:
      - -c
      - |
        # This container runs as root and is responsible for initializing the environment.
        # It sets permissions on mounted directories to ensure the 'airflow' user (running with AIRFLOW_UID)
        # can write to them. This is crucial for logs, dags, and plugins.
        echo "Creating scheduler & dag processor log directories..."
        mkdir -p /opt/airflow/logs/scheduler /opt/airflow/logs/dag_processor_manager
        echo "Initializing permissions for Airflow directories..."
        chown -R "${{ '{' }}AIRFLOW_UID{{ '}' }}:0" /opt/airflow/dags /opt/airflow/logs /opt/airflow/plugins /opt/airflow/config /opt/airflow/downloadfiles /opt/airflow/addfiles /opt/airflow/inputfiles
        echo "Setting group-writable and setgid permissions on logs directory..."
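        # The `find` commands below set the setgid bit and group rwx on existing log
        # directories (so new entries inherit group 0) and make existing log files
        # group-readable/writable for the airflow user running as AIRFLOW_UID:0.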
        find /opt/airflow/logs -type d -exec chmod g+rws {} +
        find /opt/airflow/logs -type f -exec chmod g+rw {} +
        echo "Permissions set."
        # Install curl and setup MinIO connection
        echo "Installing curl and setting up MinIO connection..."
        apt-get update -yqq && apt-get install -yqq curl
        echo "MinIO connection setup complete."
        if [[ -z "${{ '{' }}AIRFLOW_UID{{ '}' }}" ]]; then
          echo
          echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
          echo "If you are on Linux, you SHOULD follow the instructions below to set "
          echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
          echo "For other operating systems you can get rid of the warning with manually created .env file:"
          echo "    See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
          echo
        fi
        # This container's job is to initialize the database, create a user, and import connections.
        # Wait for db to be ready.
        airflow db check --retry 30 --retry-delay 5
        # Initialize the database if needed
        echo "Initializing Airflow database..."
        airflow db init
        echo "Database initialization complete."
        # Run database migrations.
        echo "Running database migrations..."
        airflow db upgrade
        echo "Database migrations complete."
        # Create the admin user if it doesn't exist.
        # The '|| true' prevents the script from failing if the user already exists.
        echo "Checking for and creating admin user..."
        airflow users create \
          --username "admin" \
          --password "${{ '{' }}AIRFLOW_ADMIN_PASSWORD:-admin_pwd_X9yZ3aB1cE5dF7gH{{ '}' }}" \
          --firstname Admin \
          --lastname User \
          --role Admin \
          --email admin@example.com || true
        echo "Admin user check/creation complete."
        # Create/update the redis_default connection to ensure the password is correct
        echo "Creating/updating redis_default connection..."
        airflow connections add 'redis_default' \
          --conn-uri "redis://:${{ '{' }}REDIS_PASSWORD{{ '}' }}@redis:6379/0" \
          || echo "Failed to add redis_default connection, but continuing."
        echo "Redis connection setup complete."
        # Import connections from any .json file in the config directory.
        echo "Searching for connection files in /opt/airflow/config..."
        if [ -d "/opt/airflow/config" ] && [ -n "$$(ls -A /opt/airflow/config/*.json 2>/dev/null)" ]; then
          for conn_file in /opt/airflow/config/*.json; do
            if [ -f "$$conn_file" ]; then
              # Exclude files that are not meant to be Airflow connections.
              if [ "$$(basename "$$conn_file")" = "camoufox_endpoints.json" ]; then
                echo "Skipping '$$conn_file' as it is not an Airflow connection file."
                continue
              fi
              echo "Importing connections from $$conn_file"
              airflow connections import "$$conn_file" || echo "Failed to import $$conn_file, but continuing."
            fi
          done
        else
          echo "No connection files found to import, or /opt/airflow/config is empty/missing."
        fi
        echo "Connection import process complete."
    # yamllint enable rule:line-length
    environment:
      <<: *airflow-common-env
      _AIRFLOW_DB_MIGRATE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'false'  # Set to false as we handle it manually
    user: "0:0"

  airflow-cli:
    <<: *airflow-common
    profiles:
      - debug
    environment:
      <<: *airflow-common-env
      CONNECTION_CHECK_MAX_COUNT: "0"
    # Workaround for entrypoint issue.
    # See: https://github.com/apache/airflow/issues/16252
    command:
      - bash
      - -c
      - airflow

  flower:
    <<: *airflow-common
    command: celery flower
    ports:
      - "5555:5555"
    environment:
      <<: *airflow-common-env
      FLOWER_BASIC_AUTH: "flower:${{ '{' }}FLOWER_PASSWORD{{ '}' }}"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  docker-socket-proxy:
    profiles:
      - disabled
    image: tecnativa/docker-socket-proxy:0.1.1
    networks:
      - proxynet
    environment:
      CONTAINERS: 1
      IMAGES: 1
      AUTH: 1
      POST: 1
    privileged: true
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    restart: always

volumes:
  redis-data:

networks:
  proxynet:
    name: airflow_proxynet
    external: true
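
# Note: the 'proxynet' network is declared external, so Compose will not create it.
# If your provisioning (e.g. the Ansible play that renders this template) does not
# already create it, a minimal manual step would be:
#   docker network create airflow_proxynet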