# yt-dlp-dags/airflow/bak/docker-compose-master.yaml
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
#
# WARNING: This configuration is for local development. Do not use it in a production deployment.
#
# This configuration supports basic configuration using environment variables or an .env file
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
# Default: pangramia/ytdlp-ops-airflow:latest
# AIRFLOW_UID - User ID in Airflow containers
# Default: 50000
# AIRFLOW_PROJ_DIR - Base path to which all the files will be volume-mounted.
# Default: .
# The following settings are mostly useful for standalone testing or running Airflow in try-out mode:
#
# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested).
# Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested).
# Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
# Use this option ONLY for quick checks. Installing requirements at container
# startup is done EVERY TIME the service is started.
# A better way is to build a custom image or extend the official image
# as described in https://airflow.apache.org/docs/docker-stack/build.html.
# Default: ''
#
# Feel free to modify this file to suit your needs.
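#
# Most services below also load variables from an .env file in this directory. A minimal sketch of
# such a file, covering only the variables referenced in this compose file (all values here are
# placeholders, not the real credentials; HOSTNAME is normally inherited from the host shell):
#
#   AIRFLOW_IMAGE_NAME=pangramia/ytdlp-ops-airflow:latest
#   AIRFLOW_UID=50000
#   AIRFLOW_GID=0
#   AIRFLOW_PROJ_DIR=.
#   POSTGRES_PASSWORD=change-me
#   REDIS_PASSWORD=change-me
#   MINIO_ROOT_USER=admin
#   MINIO_ROOT_PASSWORD=change-me
#   AIRFLOW_ADMIN_PASSWORD=change-me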
---
name: airflow-master
x-minio-common: &minio-common
image: quay.io/minio/minio:RELEASE.2025-07-23T15-54-02Z
command: server --console-address ":9001" http://minio{1...3}/data{1...2}
expose:
- "9000"
- "9001"
networks:
- proxynet
env_file:
- .env
environment:
MINIO_ROOT_USER: ${MINIO_ROOT_USER:-admin}
MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-0153093693-0009}
healthcheck:
test: ["CMD", "mc", "ready", "local"]
interval: 5s
timeout: 5s
retries: 5
restart: always
x-airflow-common:
&airflow-common
# In order to add custom dependencies or upgrade provider packages you can use your extended image.
# This will build the image from the Dockerfile in this directory and tag it.
image: ${AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest}
build: .
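  # A minimal sketch of what such a Dockerfile might contain, assuming it only layers the extra
  # Python requirements listed in _PIP_ADDITIONAL_REQUIREMENTS (below) onto the official image:
  #
  #   FROM apache/airflow:2.10.5
  #   # Install extra providers and helper libraries as the airflow user.
  #   RUN pip install --no-cache-dir \
  #         apache-airflow-providers-docker \
  #         apache-airflow-providers-http \
  #         "thrift>=0.16.0,<=0.20.0" \
  #         "backoff>=2.2.1" \
  #         python-dotenv==1.0.1 \
  #         "psutil>=5.9.0"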
# Add extra hosts here to allow the master services (webserver, scheduler) to resolve
# the hostnames of your remote DL workers. This is crucial for fetching logs.
# Format: - "hostname:ip_address"
# IMPORTANT: This section is auto-generated from cluster.yml
extra_hosts:
- "af-test:89.253.223.97"
- "dl001:109.107.189.106"
env_file:
- .env
networks:
- proxynet
environment:
&airflow-common-env
AIRFLOW__CORE__PARALLELISM: 64
AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG: 32
AIRFLOW__SCHEDULER__PARSING_PROCESSES: 4
AIRFLOW__CORE__EXECUTOR: CeleryExecutor
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}@postgres/airflow
AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}@postgres/airflow
AIRFLOW__CELERY__BROKER_URL: redis://:${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT}@redis:6379/0
AIRFLOW__CORE__FERNET_KEY: ''
AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
AIRFLOW__CORE__INTERNAL_API_SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
# yamllint disable rule:line-length
# Use simple http server on scheduler for health checks
# See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
# yamllint enable rule:line-length
AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
    # WARNING: Use the _PIP_ADDITIONAL_REQUIREMENTS option ONLY for quick checks.
    # For any other purpose (development, test and especially production usage) build/extend the Airflow image.
    #_PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- apache-airflow-providers-docker apache-airflow-providers-http thrift>=0.16.0,<=0.20.0 backoff>=2.2.1 python-dotenv==1.0.1 psutil>=5.9.0}
    # AIRFLOW_CONFIG above points to a custom config file stored in the local config folder;
    # replace airflow.cfg there with the name of your config file if you use a different one.
AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
AIRFLOW__LOGGING__REMOTE_LOG_FORMAT: "[%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s"
AIRFLOW__LOGGING__LOG_LEVEL: "INFO"
AIRFLOW__LOGGING__LOG_FILENAME_TEMPLATE: "{{ ti.dag_id }}/{{ ti.run_id }}/{{ ti.task_id }}/attempt={{ try_number }}.log"
AIRFLOW__CORE__LOCAL_SETTINGS_PATH: "/opt/airflow/config/custom_task_hooks.py"
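    # The remote logging settings above rely on an Airflow connection named "minio_default".
    # The airflow-init service below imports any *.json file found in /opt/airflow/config via
    # `airflow connections import`, so a minimal sketch of e.g. config/minio_connection.json could
    # look like this (the credentials and endpoint are assumptions based on the MinIO services
    # defined further down in this file):
    #
    #   {
    #     "minio_default": {
    #       "conn_type": "aws",
    #       "login": "admin",
    #       "password": "<MINIO_ROOT_PASSWORD>",
    #       "extra": "{\"endpoint_url\": \"http://nginx-minio-lb:9000\"}"
    #     }
    #   }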
volumes:
- ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
- ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
- ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
- ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
- ${AIRFLOW_PROJ_DIR:-.}/downloadfiles:/opt/airflow/downloadfiles
- ${AIRFLOW_PROJ_DIR:-.}/addfiles:/opt/airflow/addfiles
- ${AIRFLOW_PROJ_DIR:-.}/inputfiles:/opt/airflow/inputfiles
user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-0}"
depends_on:
&airflow-common-depends-on
redis:
condition: service_healthy
postgres:
condition: service_healthy
nginx-minio-lb:
condition: service_healthy
services:
postgres:
image: postgres:13
env_file:
- .env
networks:
- proxynet
environment:
POSTGRES_USER: airflow
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-pgdb_pwd_A7bC2xY9zE1wV5uP}
POSTGRES_DB: airflow
volumes:
- postgres-db-volume:/var/lib/postgresql/data
ports:
- "5432:5432"
healthcheck:
test: ["CMD", "pg_isready", "-U", "airflow"]
interval: 10s
retries: 5
start_period: 5s
restart: always
redis:
    # Redis is limited to 7.2-bookworm due to licensing change
# https://redis.io/blog/redis-adopts-dual-source-available-licensing/
image: redis:7.2-bookworm
env_file:
- .env
networks:
- proxynet
command: sh -c "redis-server --requirepass ${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} --bind 0.0.0.0 --save 60 1 --loglevel warning --appendonly yes"
volumes:
- ./redis-data:/data
expose:
- 6379
ports:
- "52909:6379"
healthcheck:
test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT}", "ping"]
interval: 10s
timeout: 30s
retries: 50
start_period: 30s
restart: always
redis-proxy-account-clear:
image: redis:7.2-bookworm
container_name: redis-proxy-account-clear
env_file:
- .env
networks:
- proxynet
command: >
sh -c "
echo 'Clearing proxy and account statuses from Redis...';
redis-cli -h redis -a $${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} --scan --pattern 'proxy_status:*' | xargs -r redis-cli -h redis -a $${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} DEL;
redis-cli -h redis -a $${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} --scan --pattern 'account_status:*' | xargs -r redis-cli -h redis -a $${REDIS_PASSWORD:-redis_pwd_K3fG8hJ1mN5pQ2sT} DEL;
echo 'Redis cleanup complete.'
"
depends_on:
redis:
condition: service_healthy
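    # This is a one-shot cleanup job (airflow-init below waits for it to complete successfully).
    # To re-run the cleanup on demand against a running stack, something like this should work:
    #
    #   docker compose run --rm redis-proxy-account-clear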
minio1:
<<: *minio-common
hostname: minio1
volumes:
- ./minio-data/1/1:/data1
- ./minio-data/1/2:/data2
minio2:
<<: *minio-common
hostname: minio2
volumes:
- ./minio-data/2/1:/data1
- ./minio-data/2/2:/data2
depends_on:
minio1:
condition: service_started
minio3:
<<: *minio-common
hostname: minio3
volumes:
- ./minio-data/3/1:/data1
- ./minio-data/3/2:/data2
depends_on:
minio2:
condition: service_started
nginx-minio-lb:
image: nginx:1.19.2-alpine
hostname: nginx-minio-lb
networks:
- proxynet
command: sh -c "apk add --no-cache curl >/dev/null 2>&1 && exec nginx -g 'daemon off;'"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
ports:
- "9000:9000"
- "9001:9001"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9001/minio/health/live"]
interval: 10s
timeout: 5s
retries: 5
start_period: 10s
depends_on:
minio1:
condition: service_healthy
minio2:
condition: service_healthy
minio3:
condition: service_healthy
restart: always
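  # The ./nginx.conf mounted above is not part of this file. A minimal sketch of what it might
  # contain, assuming it simply round-robins the S3 API (9000) and the console (9001) across the
  # three MinIO nodes, with the MinIO liveness probe reachable on 9001 for the healthcheck above
  # (adapted from the common MinIO-behind-nginx pattern):
  #
  #   events { worker_connections 1024; }
  #   http {
  #     upstream minio_s3      { server minio1:9000; server minio2:9000; server minio3:9000; }
  #     upstream minio_console { server minio1:9001; server minio2:9001; server minio3:9001; }
  #     server {
  #       listen 9000;
  #       location / { proxy_set_header Host $http_host; proxy_pass http://minio_s3; }
  #     }
  #     server {
  #       listen 9001;
  #       location /minio/health/ { proxy_pass http://minio_s3; }
  #       location / { proxy_set_header Host $http_host; proxy_pass http://minio_console; }
  #     }
  #   }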
minio-init:
image: minio/mc
container_name: minio-init
networks:
- proxynet
depends_on:
nginx-minio-lb:
condition: service_healthy
    entrypoint: /bin/sh
    command:
      - -c
      - |
        set -e
        /usr/bin/mc alias set minio http://nginx-minio-lb:9000 "$$MINIO_ROOT_USER" "$$MINIO_ROOT_PASSWORD"
        # Retry loop for bucket creation. A plain for loop is used as it is more portable than `until`.
        MAX_ATTEMPTS=10
        SUCCESS=false
        for i in $$(seq 1 $$MAX_ATTEMPTS); do
          # If the bucket already exists, we are done.
          if /usr/bin/mc ls minio/airflow-logs > /dev/null 2>&1; then
            echo 'MinIO bucket already exists.'
            SUCCESS=true
            break
          fi
          # Otherwise try to create it. Output is discarded because `mc mb` can error
          # if another process creates the bucket in the meantime.
          if /usr/bin/mc mb minio/airflow-logs > /dev/null 2>&1; then
            echo 'MinIO bucket created.'
            SUCCESS=true
            break
          fi
          # Both checks failed; wait and retry.
          echo "Attempt $$i/$$MAX_ATTEMPTS: Waiting for MinIO bucket..."
          sleep 2
        done
        if [ "$$SUCCESS" = "false" ]; then
          echo "Failed to create MinIO bucket after $$MAX_ATTEMPTS attempts."
          exit 1
        fi
        /usr/bin/mc anonymous set download minio/airflow-logs
        echo 'MinIO initialized: bucket airflow-logs created and policy set to download.'
env_file:
- .env
environment:
MINIO_ROOT_USER: ${MINIO_ROOT_USER:-admin}
MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-0153093693-0009}
restart: on-failure
nginx-healthcheck:
image: nginx:alpine
container_name: nginx-healthcheck
networks:
- proxynet
ports:
- "8888:80"
restart: always
airflow-webserver:
<<: *airflow-common
command: webserver
ports:
- "8080:8080"
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-scheduler:
<<: *airflow-common
command: scheduler
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-master-worker:
<<: *airflow-common
command: airflow celery worker -q main,default
healthcheck:
# yamllint disable rule:line-length
test:
- "CMD-SHELL"
- 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-master@$$(hostname)"'
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
environment:
<<: *airflow-common-env
# Required to handle warm shutdown of the celery workers properly
# See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
DUMB_INIT_SETSID: 0
AIRFLOW__CELERY__WORKER_QUEUES: "main,default"
AIRFLOW__CELERY__WORKER_TAGS: "master"
AIRFLOW__CELERY__WORKER_CONCURRENCY: "16"
AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
AIRFLOW__CELERY__WORKER_NAME: "worker-master@%h"
AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
      # Max memory per child process before it is recycled; helps prevent memory leaks.
      # The value is in KiB (262144 KiB = 256 MiB), which is sufficient for master worker tasks.
      # DL workers use a higher limit.
      AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "262144" # 256 MiB
hostname: ${HOSTNAME}
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
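    # This worker only consumes the "main" and "default" Celery queues; tasks meant for the remote
    # DL workers have to be routed to their own queues from the DAG code. For reference, a task is
    # pinned to a queue via the standard `queue` argument available on any operator, e.g. in a
    # hypothetical DAG:
    #
    #   BashOperator(task_id="runs_on_master", bash_command="echo hi", queue="main")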
airflow-triggerer:
<<: *airflow-common
command: triggerer
healthcheck:
test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
airflow-init:
<<: *airflow-common
depends_on:
<<: *airflow-common-depends-on
minio-init:
condition: service_completed_successfully
redis-proxy-account-clear:
condition: service_completed_successfully
entrypoint: /bin/bash
# yamllint disable rule:line-length
command:
- -c
- |
# This container runs as root and is responsible for initializing the environment.
# It sets permissions on mounted directories to ensure the 'airflow' user (running with AIRFLOW_UID)
# can write to them. This is crucial for logs, dags, and plugins.
echo "Initializing permissions for Airflow directories..."
chown -R "${AIRFLOW_UID}:${AIRFLOW_GID}" /opt/airflow/dags /opt/airflow/logs /opt/airflow/plugins /opt/airflow/config /opt/airflow/downloadfiles /opt/airflow/addfiles /opt/airflow/inputfiles
echo "Permissions set."
if [[ -z "${AIRFLOW_UID}" ]]; then
echo
echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
echo "If you are on Linux, you SHOULD follow the instructions below to set "
echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
echo "For other operating systems you can get rid of the warning with manually created .env file:"
echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
echo
fi
# This container's job is to initialize the database, create a user, and import connections.
# Wait for db to be ready.
airflow db check --retry 30 --retry-delay 5
# Run database migrations.
echo "Running database migrations..."
airflow db upgrade
echo "Database migrations complete."
# Create the admin user if it doesn't exist.
# The '|| true' prevents the script from failing if the user already exists.
echo "Checking for and creating admin user..."
airflow users create \
--username "admin" \
--password "${AIRFLOW_ADMIN_PASSWORD:-admin_pwd_X9yZ3aB1cE5dF7gH}" \
--firstname Admin \
--lastname User \
--role Admin \
--email admin@example.com || true
echo "Admin user check/creation complete."
# Import connections from any .json file in the config directory.
echo "Searching for connection files in /opt/airflow/config..."
if [ -d "/opt/airflow/config" ] && [ -n "$(ls -A /opt/airflow/config/*.json 2>/dev/null)" ]; then
for conn_file in /opt/airflow/config/*.json; do
if [ -f "$$conn_file" ]; then
# Exclude files that are not meant to be Airflow connections.
if [ "$(basename "$$conn_file")" = "camoufox_endpoints.json" ]; then
echo "Skipping '$$conn_file' as it is not an Airflow connection file."
continue
fi
echo "Importing connections from $$conn_file"
airflow connections import "$$conn_file" || echo "Failed to import $$conn_file, but continuing."
fi
done
else
echo "No connection files found to import, or /opt/airflow/config is empty/missing."
fi
echo "Connection import process complete."
# yamllint enable rule:line-length
environment:
<<: *airflow-common-env
_AIRFLOW_DB_MIGRATE: 'true'
_AIRFLOW_WWW_USER_CREATE: 'false' # Set to false as we handle it manually
_PIP_ADDITIONAL_REQUIREMENTS: ''
user: "0:0"
airflow-cli:
<<: *airflow-common
profiles:
- debug
environment:
<<: *airflow-common-env
CONNECTION_CHECK_MAX_COUNT: "0"
# Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
command:
- bash
- -c
- airflow
  # You can enable flower by adding the "--profile flower" option, e.g. docker-compose --profile flower up,
  # or by explicitly targeting it on the command line, e.g. docker-compose up flower.
# See: https://docs.docker.com/compose/profiles/
flower:
<<: *airflow-common
command: celery flower
ports:
- "5555:5555"
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully
docker-socket-proxy:
profiles:
- disabled
image: tecnativa/docker-socket-proxy:0.1.1
networks:
- proxynet
environment:
CONTAINERS: 1
IMAGES: 1
AUTH: 1
POST: 1
privileged: true
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
restart: always
volumes:
postgres-db-volume:
networks:
proxynet:
name: airflow_proxynet
external: true
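# The proxynet network above is marked external, so it has to exist before this stack is started.
# A typical first bring-up from this directory would look roughly like this:
#
#   docker network create airflow_proxynet
#   docker compose up airflow-init
#   docker compose up -d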