Update task pinning and default client list to tv_sample,mweb calls

aperez 2025-09-22 15:57:43 +03:00
parent 4605e0e6a5
commit 52a2d6290d
17 changed files with 189 additions and 34 deletions

.gitignore (vendored, 1 deletion)
View File

@@ -1,2 +1 @@
 .aider*
-*/.DS_Store

View File

@@ -1 +1 @@
-3.6.1
+3.10.1-exp

View File

@@ -1,7 +1,26 @@
 import logging
+import os
+import sys
 from copy import deepcopy
 from airflow.config_templates.airflow_local_settings import DEFAULT_LOGGING_CONFIG

 logger = logging.getLogger(__name__)

+# Add the config directory to the path to allow for local imports like `custom_task_hooks`.
+# This is necessary because this file is executed by the Airflow scheduler in a context
+# where the config directory is not automatically on the Python path.
+config_dir = os.path.dirname(os.path.abspath(__file__))
+if config_dir not in sys.path:
+    sys.path.insert(0, config_dir)
+    logger.info(f"Added '{config_dir}' to sys.path for local imports.")
+
 LOGGING_CONFIG = deepcopy(DEFAULT_LOGGING_CONFIG)

+# The task_instance_mutation_hook is now self-registering to be robust
+# against different loading configurations. See custom_task_hooks.py for details.
+try:
+    import custom_task_hooks
+    logger.info(f"Successfully imported custom_task_hooks module (Version: {getattr(custom_task_hooks, '__version__', 'unknown')}).")
+except ImportError as e:
+    logger.warning(f"Could not import custom_task_hooks: {e}. Worker pinning will not function.", exc_info=True)

View File

@@ -1,28 +1,55 @@
-# Version: 2025-08-20-02
+# Version: 2025-09-22-08
+__version__ = "2025-09-22-08"
 # This file contains custom hooks for the Airflow environment.
+import logging
+
 from airflow import settings
-from airflow.models.dagrun import DagRun
-from airflow.utils.session import provide_session
+from airflow.configuration import conf
+
+logger = logging.getLogger(__name__)

-@provide_session
-def task_instance_mutation_hook(ti, session=None):
+def task_instance_mutation_hook(ti):
+    """
+    This hook modifies the task instance queue at runtime for worker pinning.
+    It relies exclusively on parsing the queue from the run_id, which is guaranteed
+    to be set by the dispatcher DAG. This avoids database race conditions.
+    """
+    logger.debug(f"MUTATION HOOK: Running for dag '{ti.dag_id}', task '{ti.task_id}'.")
     if ti.dag_id == 'ytdlp_ops_worker_per_url':
-        # Query the DagRun from the DB using run_id to reliably get the conf.
-        # The ti.dag_run attribute is not always populated when the hook is called.
-        dag_run = session.query(DagRun).filter(DagRun.run_id == ti.run_id).first()
-        conf = dag_run.conf if dag_run else {}
-        worker_queue = conf.get('worker_queue')
+        # If the run_id isn't populated yet, just return. The hook may be called again.
+        if not ti.run_id:
+            logger.debug(f"MUTATION HOOK: run_id not yet available for task '{ti.task_id}'. Skipping this invocation.")
+            return
+        logger.debug(f"MUTATION HOOK: Matched DAG '{ti.dag_id}'. Attempting to pin task '{ti.task_id}' for run_id '{ti.run_id}'.")
+        worker_queue = None
+        # The dispatcher embeds the queue in the run_id like: ..._q_queue-dl-worker-hostname
+        if ti.run_id and '_q_' in ti.run_id:
+            try:
+                parsed_queue = ti.run_id.split('_q_')[-1]
+                if parsed_queue.startswith('queue-dl-'):
+                    worker_queue = parsed_queue
+            except Exception as e:
+                logger.error(f"MUTATION HOOK: CRITICAL: Error parsing queue from run_id '{ti.run_id}': {e}.", exc_info=True)
         if worker_queue:
-            print(f"MUTATION HOOK: For dag '{ti.dag_id}', pinning task '{ti.task_id}' (run_id: {ti.run_id}) to queue '{worker_queue}'.")
+            logger.debug(f"MUTATION HOOK: Pinning task '{ti.task_id}' (run_id: {ti.run_id}) to queue '{worker_queue}' from run_id.")
             ti.queue = worker_queue
         else:
-            print(f"MUTATION HOOK: For dag '{ti.dag_id}', no 'worker_queue' in conf for run_id '{ti.run_id}'. Falling back to 'queue-dl'.")
+            # If the queue is not found, it's a critical failure in the dispatching logic.
+            # We fall back to the default queue but log it as a high-severity warning.
+            logger.warning(f"MUTATION HOOK: Could not find worker queue in run_id '{ti.run_id}'. Falling back to 'queue-dl'. Pinning will fail.")
             ti.queue = 'queue-dl'

-# Register the hook only in appropriate contexts
-# This hook can cause issues with the Triggerer, which does not have a `dag_run` context
-# when it runs its own maintenance tasks.
-if not settings.CONFIG.get('core', 'executor').lower().startswith('debug'):
-    settings.task_instance_mutation_hook = task_instance_mutation_hook
+# --- Hook Registration ---
+# This registration logic is placed here to work around environments where this file
+# might be loaded directly as the local settings file via AIRFLOW__CORE__LOCAL_SETTINGS_PATH.
+try:
+    if not conf.get('core', 'executor').lower().startswith('debug'):
+        settings.task_instance_mutation_hook = task_instance_mutation_hook
+        logger.info(f"Successfully self-registered task_instance_mutation_hook (Version: {__version__}) for worker pinning.")
+    else:
+        logger.info("Skipping self-registration of task_instance_mutation_hook due to DebugExecutor.")
+except Exception as e:
+    logger.warning(f"Could not self-register custom_task_hooks: {e}. Worker pinning may not function.", exc_info=True)

View File

@@ -251,6 +251,8 @@ def dump_redis_data_to_csv(redis_client, dump_dir, patterns):
 def clear_queue_callable(**context):
     """Dumps Redis data to CSV and/or clears specified Redis keys based on selection."""
     params = context['params']
+    ti = context['task_instance']
+    logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
     redis_conn_id = params['redis_conn_id']
     queue_base_name = params['queue_base_name']
     queues_to_clear_options = params.get('queues_to_clear_options', [])
@@ -302,6 +304,8 @@ def clear_queue_callable(**context):
 def list_contents_callable(**context):
     """Lists the contents of the specified Redis key(s) (list or hash)."""
     params = context['params']
+    ti = context['task_instance']
+    logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
     redis_conn_id = params['redis_conn_id']
     queues_to_list_str = params.get('queue_to_list')
     max_items = params.get('max_items', 10)
@@ -379,6 +383,8 @@ def list_contents_callable(**context):
 def check_status_callable(**context):
     """Checks the status (type and size) of all standard Redis queues for a given base name."""
     params = context['params']
+    ti = context['task_instance']
+    logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
     redis_conn_id = params['redis_conn_id']
     queue_name = params.get('queue_base_name', DEFAULT_QUEUE_NAME)
     queue_suffixes = ['_inbox', '_progress', '_result', '_fail']
@@ -414,6 +420,8 @@ def requeue_failed_callable(**context):
     Copies all URLs from the fail hash to the inbox list and optionally clears the fail hash.
     """
     params = context['params']
+    ti = context['task_instance']
+    logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
     redis_conn_id = params['redis_conn_id']
     queue_name = params['queue_base_name']
     clear_fail_queue = params['clear_fail_queue_after_requeue']
@@ -468,6 +476,8 @@ def add_videos_to_queue_callable(**context):
     normalizes them to URLs, and adds them to a Redis queue.
     """
     params = context["params"]
+    ti = context['task_instance']
+    logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
     queue_name = params["queue_base_name"]
     redis_conn_id = params["redis_conn_id"]
     dry_run = params["dry_run"]
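Each of the five hunks above adds the same two lines to a different callable. If the duplication grows, the pattern could be pulled into a small helper; this is a sketch only, with log_task_queue as a hypothetical name, assuming the standard Airflow context dict:

import logging

logger = logging.getLogger(__name__)

def log_task_queue(context):
    """Log which Celery queue the current task instance landed on (pinning diagnostics)."""
    ti = context['task_instance']
    logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")

# Usage inside any of the callables above:
# def clear_queue_callable(**context):
#     log_task_queue(context)
#     ...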

View File

@@ -30,6 +30,9 @@ def dispatch_url_to_worker(**context):
     Pulls one URL from Redis, determines the current worker's dedicated queue,
     and triggers the main worker DAG to process the URL on that specific queue.
     """
+    ti = context['task_instance']
+    logger.info(f"Dispatcher task '{ti.task_id}' running on queue '{ti.queue}'.")
+
     # --- Check for worker pause lock file ---
     # This path must be consistent with the Ansible playbook.
     lock_file_path = '/opt/airflow/inputfiles/AIRFLOW.PREVENT_URL_PULL.lockfile'
@@ -65,7 +68,9 @@ def dispatch_url_to_worker(**context):
     # We add the specific URL and the determined worker queue to the configuration.
     conf_to_pass = {**params, 'url_to_process': url_to_process, 'worker_queue': worker_queue}
-    run_id = f"worker_run_{context['dag_run'].run_id}_{context['ts_nodash']}"
+    # Embed the worker queue in the run_id to avoid DB race conditions in the mutation hook.
+    # The hook will parse the queue name from the run_id itself.
+    run_id = f"worker_run_{context['dag_run'].run_id}_{context['ts_nodash']}_q_{worker_queue}"
     logger.info(f"Triggering 'ytdlp_ops_worker_per_url' with run_id '{run_id}'")
     trigger_dag(
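Together with the mutation hook earlier in this commit, this change forms a round trip: the dispatcher encodes the worker queue into the run_id here, and the hook decodes it later, with no DagRun lookup in between. An illustrative sketch with made-up values:

parent_run_id = 'scheduled__2025-09-22T12:00:00+00:00'   # example dispatcher run_id
ts_nodash = '20250922T120000'                            # example context['ts_nodash']
worker_queue = 'queue-dl-worker-host01'                  # example per-worker queue

run_id = f"worker_run_{parent_run_id}_{ts_nodash}_q_{worker_queue}"

# What the hook recovers on the other side:
parsed_queue = run_id.split('_q_')[-1]
assert parsed_queue == worker_queue and parsed_queue.startswith('queue-dl-')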

View File

@@ -96,12 +96,15 @@ def orchestrate_workers_ignition_callable(**context):
     to initiate self-sustaining processing loops.
     """
     params = context['params']
+    ti = context['task_instance']
+    logger.info(f"Orchestrator task '{ti.task_id}' running on queue '{ti.queue}'.")
     logger.info("Starting dispatcher ignition sequence.")

     dispatcher_dag_id = 'ytdlp_ops_dispatcher'
     dag_model = DagModel.get_dagmodel(dispatcher_dag_id)
     if dag_model and dag_model.is_paused:
-        raise AirflowException(f"Dispatcher DAG '{dispatcher_dag_id}' is paused. Cannot start dispatcher loops.")
+        logger.warning(f"Dispatcher DAG '{dispatcher_dag_id}' is paused. Skipping dispatcher ignition.")
+        raise AirflowSkipException(f"Dispatcher DAG '{dispatcher_dag_id}' is paused.")

     total_workers = int(params['total_workers'])
     workers_per_bunch = int(params['workers_per_bunch'])
@@ -264,7 +267,7 @@ with DAG(
         ),
         'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="[Worker Param] Base name for Redis queues."),
         'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."),
-        'clients': Param('web', type="string", description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_sample, tv_embedded"),
+        'clients': Param('tv_sample,mweb,web_camoufox', type="string", description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_sample, tv_embedded"),
         'account_pool': Param('ytdlp_account', type="string", description="[Worker Param] Account pool prefix or comma-separated list."),
         'account_pool_size': Param(10, type=["integer", "null"], description="[Worker Param] If using a prefix for 'account_pool', this specifies the number of accounts to generate (e.g., 10 for 'prefix_01' through 'prefix_10'). Required when using a prefix."),
         'service_ip': Param(DEFAULT_YT_AUTH_SERVICE_IP, type="string", description="[Worker Param] IP of the ytdlp-ops-server. Default is from Airflow variable YT_AUTH_SERVICE_IP or hardcoded."),

View File

@@ -55,8 +55,8 @@ DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172
 DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)

 # The queue is set to a fallback here. The actual worker-specific queue is
-# assigned just-in-time by the task_instance_mutation_hook in airflow_local_settings.py,
-# which reads the 'worker_queue' from the DAG run configuration.
+# assigned just-in-time by the task_instance_mutation_hook (see: airflow/config/custom_task_hooks.py),
+# which parses the target queue from the DAG run_id.
 DEFAULT_ARGS = {
     'owner': 'airflow',
     'retries': 0,
@@ -151,6 +151,32 @@ def get_url_and_assign_account(**context):
     This is the first task in the pinned-worker DAG.
     """
     params = context['params']
+    ti = context['task_instance']
+
+    # --- Worker Pinning Verification ---
+    # This is a safeguard against a known Airflow issue where clearing a task
+    # can cause the task_instance_mutation_hook to be skipped, breaking pinning.
+    # See: https://github.com/apache/airflow/issues/20143
+    expected_queue = None
+    if ti.run_id and '_q_' in ti.run_id:
+        expected_queue = ti.run_id.split('_q_')[-1]
+    if not expected_queue:
+        # Fallback to conf if run_id parsing fails for some reason
+        expected_queue = params.get('worker_queue')
+
+    if expected_queue and ti.queue != expected_queue:
+        error_msg = (
+            f"WORKER PINNING FAILURE: Task is running on queue '{ti.queue}' but was expected on '{expected_queue}'. "
+            "This usually happens after manually clearing a task, which is not the recommended recovery method for this DAG. "
+            "To recover a failed URL, let the DAG run fail, use the 'ytdlp_mgmt_queues' DAG to requeue the URL, "
+            "and use the 'ytdlp_ops_orchestrator' to start a new worker loop if needed."
+        )
+        logger.error(error_msg)
+        raise AirflowException(error_msg)
+    elif expected_queue:
+        logger.info(f"Worker pinning verified. Task is correctly running on queue '{ti.queue}'.")
+    # --- End Verification ---
+
     # The URL is passed by the dispatcher DAG.
     url_to_process = params.get('url_to_process')

View File

@@ -3,7 +3,8 @@
   hosts: airflow_master
   gather_facts: no
   vars_files:
-    - group_vars/all.yml
+    - "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
+    - "{{ inventory_dir }}/group_vars/all/vault.yml"
   tasks:
     - name: Sync DAGs to MASTER server
       ansible.posix.synchronize:
@@ -33,7 +34,8 @@
   hosts: airflow_workers
   gather_facts: no
   vars_files:
-    - group_vars/all.yml
+    - "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
+    - "{{ inventory_dir }}/group_vars/all/vault.yml"
   tasks:
     - name: Sync DAGs to WORKER server
       ansible.posix.synchronize:

ansible/playbook-dl.yml (new empty file)
View File

View File

@@ -3,8 +3,8 @@
   hosts: all
   gather_facts: true
   vars_files:
-    - group_vars/all.yml
-    - group_vars/all/vault.yml
+    - "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
+    - "{{ inventory_dir }}/group_vars/all/vault.yml"
   pre_tasks:
     - name: Announce fast deploy mode if enabled
       debug:

ansible/playbook-hook.yml (new file, 53 additions)
View File

@@ -0,0 +1,53 @@
+---
+- name: Deploy and Reload Airflow Task Hook
+  hosts: all
+  gather_facts: no
+  vars_files:
+    - "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
+    - "{{ inventory_dir }}/group_vars/all/vault.yml"
+  tasks:
+    - name: Sync custom_task_hooks.py to MASTER server
+      when: inventory_hostname in groups['airflow_master']
+      synchronize:
+        src: "../airflow/config/custom_task_hooks.py"
+        dest: "{{ airflow_master_dir }}/config/"
+        archive: yes
+        rsync_path: "sudo rsync"
+
+    - name: Sync airflow_local_settings.py to MASTER server
+      when: inventory_hostname in groups['airflow_master']
+      synchronize:
+        src: "../airflow/config/airflow_local_settings.py"
+        dest: "{{ airflow_master_dir }}/config/"
+        archive: yes
+        rsync_path: "sudo rsync"
+
+    - name: Sync custom_task_hooks.py to WORKER server
+      when: inventory_hostname in groups['airflow_workers']
+      synchronize:
+        src: "../airflow/config/custom_task_hooks.py"
+        dest: "{{ airflow_worker_dir }}/config/"
+        archive: yes
+        rsync_path: "sudo rsync"
+
+    - name: Sync airflow_local_settings.py to WORKER server
+      when: inventory_hostname in groups['airflow_workers']
+      synchronize:
+        src: "../airflow/config/airflow_local_settings.py"
+        dest: "{{ airflow_worker_dir }}/config/"
+        archive: yes
+        rsync_path: "sudo rsync"
+
+    - name: Restart Airflow services on MASTER
+      when: inventory_hostname in groups['airflow_master']
+      ansible.builtin.command:
+        cmd: "docker compose restart airflow-scheduler airflow-webserver airflow-master-worker airflow-triggerer"
+        chdir: "{{ airflow_master_dir }}"
+      become: yes
+
+    - name: Restart Airflow worker on WORKER
+      when: inventory_hostname in groups['airflow_workers']
+      ansible.builtin.command:
+        cmd: "docker compose restart airflow-worker"
+        chdir: "{{ airflow_worker_dir }}"
+      become: yes

View File

@@ -3,8 +3,8 @@
   hosts: airflow_master
  gather_facts: yes
   vars_files:
-    - group_vars/all.yml
-    - group_vars/all/vault.yml
+    - "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
+    - "{{ inventory_dir }}/group_vars/all/vault.yml"
  pre_tasks:
     - name: Announce master deployment
       debug:

View File

@@ -3,8 +3,8 @@
   hosts: airflow_workers
   gather_facts: yes
   vars_files:
-    - group_vars/all.yml
-    - group_vars/all/vault.yml
+    - "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
+    - "{{ inventory_dir }}/group_vars/all/vault.yml"
   pre_tasks:
     - name: Announce worker deployment
       debug:

View File

@@ -2,7 +2,8 @@
 - hosts: airflow_workers
   gather_facts: no
   vars_files:
-    - ../group_vars/all.yml
+    - "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
+    - "{{ inventory_dir }}/group_vars/all/vault.yml"
   tasks:
     - name: "Create lock file to pause worker"
       file:

View File

@@ -2,7 +2,8 @@
 - hosts: airflow_workers
   gather_facts: yes
   vars_files:
-    - ../group_vars/all.yml
+    - "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
+    - "{{ inventory_dir }}/group_vars/all/vault.yml"
   tasks:
     - name: "Archive lock file to resume worker"
       command: >

deploy.sh (new file, 9 additions)
View File

@@ -0,0 +1,9 @@
+#!/bin/bash
+set -e
+
+echo "Generating inventory..."
+./tools/generate-inventory.py cluster.test.yml
+
+echo "Deploying full cluster..."
+cd ansible
+ansible-playbook playbook-full.yml