Add S3 uploader changes and Ansible changes (tested on af-test machines); deprecate the camoufox run

aperez 2025-11-23 17:23:51 +03:00
parent 3709ba6f81
commit 61873b46f9
29 changed files with 4974 additions and 193 deletions

.gitignore (vendored), 2 lines changed
View File

@ -1,2 +1,2 @@
**/__pycache__/*
.aider*
__pycache__

View File

@ -50,6 +50,12 @@ RUN FFMPEG_URL="https://github.com/yt-dlp/FFmpeg-Builds/releases/download/latest
ln -sf /opt/ffmpeg/bin/ffprobe /usr/local/bin/ffprobe && \
rm -rf /tmp/ffmpeg.tar.xz
# Install s5cmd
RUN S5CMD_URL="https://github.com/peak/s5cmd/releases/download/v2.3.0/s5cmd_2.3.0_linux_amd64.deb" && \
wget -qO /tmp/s5cmd.deb "$S5CMD_URL" && \
dpkg -i /tmp/s5cmd.deb && \
rm /tmp/s5cmd.deb
# Install yt-dlp from master
# Temporarily rename pip to bypass the root check in the base image's pip wrapper,
# ensuring a system-wide installation.
@ -72,7 +78,7 @@ RUN wget -q https://github.com/ginuerzh/gost/releases/download/v2.12.0/gost_2.12
rm gost_2.12.0_linux_amd64.tar.gz
# Verify installations
RUN ffmpeg -version && deno --version && yt-dlp --version && aria2c --version && gost -V
RUN ffmpeg -version && deno --version && yt-dlp --version && aria2c --version && gost -V && s5cmd version
# Create version information files
RUN ( \
@ -107,7 +113,8 @@ RUN mv /usr/local/bin/pip /usr/local/bin/pip.orig && \
"ffprobe3" \
"python-dotenv" \
"PyYAML" \
"aria2p" && \
"aria2p" \
"s5cmdpy" && \
mv /usr/local/bin/pip.orig /usr/local/bin/pip
# --- Install the custom yt_ops_services package ---
@ -141,6 +148,7 @@ RUN mv /usr/local/bin/pip /usr/local/bin/pip.orig && \
# This fixes permission issues that can occur if previous RUN commands created files in /home/airflow as root.
# We also make it world-writable to accommodate running the container with a different user ID, which can
# happen in some environments (e.g., OpenShift or with docker-compose user overrides).
RUN mkdir -p /home/airflow/.aws && chown -R airflow:airflow /home/airflow/.aws
RUN chown -R airflow:airflow /home/airflow && chmod -R 777 /home/airflow
# Switch to airflow user for all subsequent operations

View File

@ -54,8 +54,8 @@ x-airflow-common:
# Remote Logging - connection is configured directly via environment variables
#_PIP_ADDITIONAL_REQUIREMENTS: ${{ '{' }}_PIP_ADDITIONAL_REQUIREMENTS:- apache-airflow-providers-docker apache-airflow-providers-http thrift>=0.16.0,<=0.20.0 backoff>=2.2.1 python-dotenv==1.0.1 psutil>=5.9.0 apache-airflow-providers-amazon{{ '}' }}
AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://videos/airflow-logs"
AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: s3_delivery_connection
AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
#AIRFLOW__LOGGING__LOG_ID_TEMPLATE: "{dag_id}-{task_id}-{run_id}-{try_number}"
AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
@ -121,6 +121,51 @@ services:
- proxynet
restart: always
airflow-worker-s3:
<<: *airflow-common
container_name: airflow-worker-s3-1
hostname: ${HOSTNAME:-s3-001}
# The S3 worker listens on the generic s3 queue AND its own dedicated queue.
command: airflow celery worker -q queue-s3,queue-s3-${HOSTNAME:-s3-001}
deploy:
resources:
limits:
memory: ${AIRFLOW_WORKER_S3_MEM_LIMIT:-1G}
reservations:
memory: ${AIRFLOW_WORKER_S3_MEM_RESERV:-256M}
healthcheck:
test:
- "CMD-SHELL"
- 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-s3@$$(hostname)"'
interval: 30s
timeout: 30s
retries: 5
start_period: 30s
environment:
<<: *airflow-common-env
S3_DELIVERY_AWS_ACCESS_KEY_ID: "{{ vault_s3_delivery_access_key_id }}"
S3_DELIVERY_AWS_SECRET_ACCESS_KEY: "{{ vault_s3_delivery_secret_access_key }}"
S3_DELIVERY_AWS_REGION: "{{ vault_s3_delivery_aws_region }}"
S3_DELIVERY_ENDPOINT: "{{ vault_s3_delivery_endpoint }}"
S3_DELIVERY_BUCKET: "{{ vault_s3_delivery_bucket }}"
HOSTNAME: ${HOSTNAME:-s3-001}
DUMB_INIT_SETSID: "0"
AIRFLOW__CELERY__WORKER_QUEUES: "queue-s3,queue-s3-${HOSTNAME:-s3-001}"
AIRFLOW__CELERY__WORKER_TAGS: "s3"
AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
# S3 tasks are lightweight.
AIRFLOW__CELERY__WORKER_AUTOSCALE: "2,1"
AIRFLOW__CELERY__POOL: "prefork"
AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
AIRFLOW__CELERY__WORKER_NAME: "worker-s3@%h"
AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "262144" # 256MB
networks:
- default
- proxynet
restart: always
airflow-worker-auth:
<<: *airflow-common
container_name: airflow-worker-auth-1
@ -175,6 +220,46 @@ services:
- /var/run/docker.sock:/var/run/docker.sock:ro
restart: always
airflow-worker-mgmt:
<<: *airflow-common
container_name: airflow-worker-mgmt-1
hostname: ${HOSTNAME:-mgmt001}
# The Mgmt worker listens on the generic mgmt queue AND its own dedicated queue.
command: airflow celery worker -q queue-mgmt,queue-mgmt-${HOSTNAME:-mgmt001}
deploy:
resources:
limits:
memory: ${AIRFLOW_WORKER_MGMT_MEM_LIMIT:-2G}
reservations:
memory: ${AIRFLOW_WORKER_MGMT_MEM_RESERV:-512M}
healthcheck:
test:
- "CMD-SHELL"
- 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-mgmt@$$(hostname)"'
interval: 30s
timeout: 30s
retries: 5
start_period: 30s
environment:
<<: *airflow-common-env
HOSTNAME: ${HOSTNAME:-mgmt001}
DUMB_INIT_SETSID: "0"
AIRFLOW__CELERY__WORKER_QUEUES: "queue-mgmt,queue-mgmt-${HOSTNAME:-mgmt001}"
AIRFLOW__CELERY__WORKER_TAGS: "mgmt"
AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
# Mgmt tasks are lightweight.
AIRFLOW__CELERY__WORKER_AUTOSCALE: "4,2"
AIRFLOW__CELERY__POOL: "prefork"
AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
AIRFLOW__CELERY__WORKER_NAME: "worker-mgmt@%h"
AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "262144" # 256MB
networks:
- default
- proxynet
restart: always
networks:
proxynet:
name: airflow_proxynet
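
A minimal sketch (not part of this commit) of how the `s3_delivery_connection` referenced above could be registered from the same S3_DELIVERY_* variables. Field names follow how the uploader DAG reads the connection (login/password/host plus `bucket` and `region_name` in extra); the values come from the environment and are otherwise placeholders:

import json
import os

from airflow import settings
from airflow.models import Connection

conn = Connection(
    conn_id="s3_delivery_connection",
    conn_type="aws",
    login=os.environ["S3_DELIVERY_AWS_ACCESS_KEY_ID"],
    password=os.environ["S3_DELIVERY_AWS_SECRET_ACCESS_KEY"],
    host=os.environ["S3_DELIVERY_ENDPOINT"],
    extra=json.dumps({
        "bucket": os.environ["S3_DELIVERY_BUCKET"],
        "region_name": os.environ["S3_DELIVERY_AWS_REGION"],
    }),
)
session = settings.Session()
# Only create the connection if it does not already exist.
if not session.query(Connection).filter(Connection.conn_id == conn.conn_id).one_or_none():
    session.add(conn)
    session.commit()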

View File

@ -1,12 +1,4 @@
name: ytdlp-ops
{% if service_role is defined and service_role != 'management' %}
include:
# This automatically includes the generated camoufox service definitions and dependencies.
# It simplifies the docker-compose command, as you no longer need to specify both files with -f.
# The file is generated by the config-generator service and will be created even if empty.
- ./configs/docker-compose.camoufox.yaml
{% endif %}
services:
bgutil-provider:
image: brainicism/bgutil-ytdlp-pot-provider
@ -66,10 +58,6 @@ services:
depends_on:
context-prepper:
condition: service_completed_successfully
{% if service_role is defined and service_role != 'management' %}
camoufox-group:
condition: service_started
{% endif %}
# Ports are no longer exposed directly. Envoy will connect to them on the internal network.
# entrypoint:
# - /bin/sh

View File

@ -54,22 +54,6 @@ services:
- proxynet
{% endfor %}
{% if camoufox_proxies %}
# This service is a dependency anchor. The main services depend on it,
# and it in turn depends on all camoufox instances.
camoufox-group:
image: alpine:latest
command: ["echo", "Camoufox group ready."]
restart: "no"
depends_on:
{% for proxy in camoufox_proxies %}
{% set proxy_port = _get_port_from_proxy_url(proxy.url) | int %}
- camoufox-{{ proxy_port }}-{{ loop.index }}
{% endfor %}
networks:
- proxynet
{% endif %}
volumes:
{% for proxy in camoufox_proxies %}
{% set proxy_port = _get_port_from_proxy_url(proxy.url) | int %}

View File

@ -788,6 +788,7 @@ def manage_system_callable(**context):
with DAG(
dag_id="ytdlp_mgmt_proxy_account",
default_args={"queue": "queue-mgmt"},
start_date=days_ago(1),
schedule=None,
catchup=False,

View File

@ -591,6 +591,7 @@ with DAG(
"owner": "airflow",
"start_date": days_ago(1),
"retries": 0,
"queue": "queue-mgmt",
},
schedule=None,
catchup=False,

View File

@ -27,7 +27,7 @@ from airflow.utils.dates import days_ago
# Import utility functions and Thrift modules
from utils.redis_utils import _get_redis_client
from pangramia.yt.tokens_ops import YTTokenOpService
from pangramia.yt.management import YTManagementService
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport
@ -36,14 +36,14 @@ logger = logging.getLogger(__name__)
# Default settings from Airflow Variables or hardcoded fallbacks
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9980)
DEFAULT_MANAGEMENT_SERVICE_IP = Variable.get("MANAGEMENT_SERVICE_HOST", default_var="172.17.0.1")
DEFAULT_MANAGEMENT_SERVICE_PORT = Variable.get("MANAGEMENT_SERVICE_PORT", default_var=9080)
DEFAULT_ARGS = {
'owner': 'airflow',
'retries': 1,
'retry_delay': 30,
'queue': 'default',
'queue': 'queue-mgmt',
}
@ -55,7 +55,7 @@ def _get_thrift_client(host, port, timeout=60):
transport.setTimeout(timeout * 1000)
transport = TTransport.TFramedTransport(transport)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = YTTokenOpService.Client(protocol)
client = YTManagementService.Client(protocol)
transport.open()
logger.info(f"Connected to Thrift server at {host}:{port}")
return client, transport
@ -72,8 +72,8 @@ def manage_account_states(**context):
cooldown_duration_s = params['account_cooldown_duration_min'] * 60
ban_duration_s = params['account_ban_duration_hours'] * 3600
host = DEFAULT_YT_AUTH_SERVICE_IP
port = int(DEFAULT_YT_AUTH_SERVICE_PORT)
host = DEFAULT_MANAGEMENT_SERVICE_IP
port = int(DEFAULT_MANAGEMENT_SERVICE_PORT)
redis_conn_id = DEFAULT_REDIS_CONN_ID
logger.info(f"Starting account maintenance. Service: {host}:{port}, Redis: {redis_conn_id}")
logger.info(f"Using limits: Requests={requests_limit}, Cooldown={params['account_cooldown_duration_min']}m, Ban={params['account_ban_duration_hours']}h")
@ -230,8 +230,8 @@ with DAG(
This process gives full control over time-based account lifecycle management to the Airflow orchestrator.
""",
params={
'account_requests_limit': Param(250, type="integer", description="Number of successful requests an account can make before it is rested."),
'account_cooldown_duration_min': Param(60, type="integer", description="Duration in minutes an account must rest before being activated again. Default is 1 hour."),
'account_requests_limit': Param(250, type="integer", description="Number of successful requests an account can make before it is rested. Default is 250."),
'account_cooldown_duration_min': Param(60, type="integer", description="Duration in minutes an account must rest ('pause') before being activated again. Default is 60 minutes (1 hour)."),
'account_ban_duration_hours': Param(24, type="integer", description="Duration in hours an account stays banned before it can be un-banned."),
}
) as dag:

View File

@ -411,6 +411,7 @@ with DAG(
orchestrate_task = PythonOperator(
task_id='start_worker_loops',
python_callable=orchestrate_workers_ignition_callable,
queue='queue-mgmt',
)
orchestrate_task.doc_md = """
### Start Worker Loops

View File

@ -37,6 +37,7 @@ import socket
import time
import traceback
import uuid
import shutil
# Import utility functions and Thrift modules
from utils.redis_utils import _get_redis_client
@ -140,9 +141,14 @@ def _get_thrift_client(host, port, timeout):
return client, transport
def _extract_video_id(url):
"""Extracts YouTube video ID from URL."""
"""Extracts YouTube video ID from a URL or returns the input if it's already a valid ID."""
if not url or not isinstance(url, str):
return None
# Check if the input is already a valid 11-character video ID
if re.fullmatch(r'[a-zA-Z0-9_-]{11}', url):
return url
patterns = [r'v=([a-zA-Z0-9_-]{11})', r'youtu\.be/([a-zA-Z0-9_-]{11})']
for pattern in patterns:
match = re.search(pattern, url)
@ -299,7 +305,7 @@ def get_token(initial_data: dict, **context):
account_id = initial_data['account_id']
url = initial_data['url_to_process']
info_json_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')
info_json_dir = os.path.join(Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles'), 'videos', 'in-progress')
host, port = params['service_ip'], int(params['service_port'])
machine_id = params.get('machine_id') or socket.gethostname()
@ -308,9 +314,11 @@ def get_token(initial_data: dict, **context):
assigned_proxy_url = params.get('assigned_proxy_url')
video_id = _extract_video_id(url)
os.makedirs(info_json_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
info_json_path = os.path.join(info_json_dir, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json")
job_dir_name = f"{timestamp}-{video_id or 'unknown'}"
job_dir_path = os.path.join(info_json_dir, job_dir_name)
os.makedirs(job_dir_path, exist_ok=True)
info_json_path = os.path.join(job_dir_path, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json")
cmd = [
'ytops-client', 'get-info',
@ -375,6 +383,7 @@ def get_token(initial_data: dict, **context):
return {
'info_json_path': info_json_path,
'job_dir_path': job_dir_path,
'socks_proxy': proxy,
'ytdlp_command': None,
'successful_account_id': account_id,
@ -653,7 +662,10 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
info_json_path = token_data.get('info_json_path')
proxy = token_data.get('socks_proxy')
original_url = token_data.get('original_url')
download_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles/video')
download_dir = token_data.get('job_dir_path')
if not download_dir:
# Fallback for older runs or if job_dir_path is missing
download_dir = os.path.dirname(info_json_path)
format_preset = params.get('download_format_preset', 'best_audio')
if format_preset == 'custom':
@ -678,6 +690,21 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
if not (info_json_path and os.path.exists(info_json_path)):
raise AirflowException(f"Error: info.json path is missing or file does not exist ({info_json_path}).")
# WORKAROUND: The auth service may inject a 'js_runtimes' key into the info.json
# that is incompatible with the yt-dlp library's expectations, causing a crash.
# We remove it here before passing it to the download tool.
try:
with open(info_json_path, 'r+', encoding='utf-8') as f:
info_data = json.load(f)
if 'js_runtimes' in info_data:
logger.info("Found 'js_runtimes' key in info.json. Removing it as a workaround for yt-dlp library incompatibility.")
del info_data['js_runtimes']
f.seek(0)
json.dump(info_data, f)
f.truncate()
except Exception as e:
logger.warning(f"Could not process/remove 'js_runtimes' from info.json: {e}", exc_info=True)
def run_yt_dlp_command(format_selector: str):
"""Constructs and runs a yt-ops-client download command, returning a list of final filenames."""
downloader = params.get('downloader', 'py')
@ -690,20 +717,26 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
cmd.extend(['--output-dir', download_dir])
# The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args
# WORKAROUND: Due to an incompatibility between ytops-client and a recent yt-dlp
# library update, passing --extra-ytdlp-args to the 'py' downloader causes a crash.
# These arguments are being omitted until ytops-client is fixed.
# This affects: fragment_retries, limit_rate, socket_timeout, sleep_interval,
# max_sleep_interval, yt_dlp_test_mode, and the 'yt_dlp_extra_args' DAG param.
has_extra_args = (
params.get('fragment_retries') or params.get('limit_rate') or
params.get('socket_timeout') or params.get('min_sleep_interval') or
params.get('max_sleep_interval') or params.get('yt_dlp_test_mode') or
params.get('yt_dlp_extra_args')
)
if has_extra_args:
logger.warning("WORKAROUND: Omitting --extra-ytdlp-args for 'py' downloader due to a known incompatibility. "
"Some download parameters will be ignored.")
# The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args
py_extra_args = []
if params.get('fragment_retries'):
py_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])])
if params.get('limit_rate'):
py_extra_args.extend(['--limit-rate', params['limit_rate']])
if params.get('socket_timeout'):
py_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])])
if params.get('min_sleep_interval'):
py_extra_args.extend(['--sleep-interval', str(params['min_sleep_interval'])])
if params.get('max_sleep_interval'):
py_extra_args.extend(['--max-sleep-interval', str(params['max_sleep_interval'])])
if params.get('yt_dlp_test_mode'):
py_extra_args.append('--test')
existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '')
final_extra_args_list = existing_extra + py_extra_args
if final_extra_args_list:
final_extra_args_str = shlex.join(final_extra_args_list)
cmd.extend(['--extra-ytdlp-args', final_extra_args_str])
elif downloader == 'aria-rpc':
cmd.extend([
@ -744,7 +777,10 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)])
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
logger.info(f"Executing download command for format '{format_selector}': {copy_paste_cmd}")
logger.info(f"--- Preparing to execute ytops-client ---")
logger.info(f"Full ytops-client command for format '{format_selector}':")
logger.info(copy_paste_cmd)
logger.info(f"-----------------------------------------")
process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
if process.stdout:
@ -855,9 +891,20 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
final_formats_to_download = formats_to_download_initial
else:
for selector in formats_to_download_initial:
# A selector can be '140' or '299/298/137'
# A selector can be '140' or '299/298/137' or '140-dashy'
individual_ids = re.split(r'[/+]', selector)
if any(fid in available_formats for fid in individual_ids):
# Extract the numeric part of the format ID for checking against available_formats
is_available = False
for fid in individual_ids:
numeric_id_match = re.match(r'^\d+', fid)
if numeric_id_match:
numeric_id = numeric_id_match.group(0)
if numeric_id in available_formats:
is_available = True
break # Found a match, no need to check other parts of the selector
if is_available:
final_formats_to_download.append(selector)
else:
logger.warning(f"Requested format selector '{selector}' contains no available formats. Skipping.")
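# Worked example of the numeric-prefix check above (illustrative, not part of this commit):
#   with available_formats = ['140', '299']:
#     '140-dashy'   -> numeric prefix '140' is available -> selector kept
#     '299/298/137' -> first matching prefix '299' is available -> selector kept
#     '251'         -> no part matches available_formats -> skipped with the warning above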
@ -872,54 +919,51 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
logger.info(f"Test mode: yt-dlp returned {len(successful_files)} filenames. Skipping probe failure checks.")
if not successful_files:
raise AirflowException("Test run did not produce any filenames.")
return successful_files
# Do not return here. Proceed to the cleanup and move logic.
if not failed_files:
if not successful_files:
raise AirflowException("Download and probe process completed but produced no valid files.")
return successful_files
final_success_list = successful_files
if failed_files:
# --- Handle Probe Failures and Retry ---
if not retry_on_probe_failure:
raise AirflowException(f"Probe failed for {len(failed_files)} file(s) and retry is disabled: {failed_files}")
# --- Handle Probe Failures and Retry ---
if not retry_on_probe_failure:
raise AirflowException(f"Probe failed for {len(failed_files)} file(s) and retry is disabled: {failed_files}")
logger.warning(f"Probe failed for {len(failed_files)} file(s). Attempting one re-download for failed files...")
logger.warning(f"Probe failed for {len(failed_files)} file(s). Attempting one re-download for failed files...")
format_ids_to_retry = []
# Since each download is now for a specific selector and the output template
# includes the format_id, we can always attempt to extract the format_id
# from the failed filename for a targeted retry.
for f in failed_files:
match = re.search(r'\.f([\d]+)\.', f)
if match:
format_ids_to_retry.append(match.group(1))
else:
logger.error(f"Could not extract format_id from failed file '{f}'. Cannot retry this specific file.")
formats_to_download_retry = format_ids_to_retry
format_ids_to_retry = []
# Since each download is now for a specific selector and the output template
# includes the format_id, we can always attempt to extract the format_id
# from the failed filename for a targeted retry.
for f in failed_files:
match = re.search(r'\.f([\d]+)\.', f)
if match:
format_ids_to_retry.append(match.group(1))
else:
logger.error(f"Could not extract format_id from failed file '{f}'. Cannot retry this specific file.")
formats_to_download_retry = format_ids_to_retry
if not formats_to_download_retry:
raise AirflowException("Probe failed, but could not determine which formats to retry.")
if not formats_to_download_retry:
raise AirflowException("Probe failed, but could not determine which formats to retry.")
# Rename failed files to allow for a fresh download attempt
for f in failed_files:
try:
failed_path = f"{f}.probe_failed_{int(time.time())}"
os.rename(f, failed_path)
logger.info(f"Renamed corrupted file to {failed_path}")
except OSError as rename_err:
logger.error(f"Could not rename corrupted file '{f}': {rename_err}")
# Rename failed files to allow for a fresh download attempt
for f in failed_files:
try:
failed_path = f"{f}.probe_failed_{int(time.time())}"
os.rename(f, failed_path)
logger.info(f"Renamed corrupted file to {failed_path}")
except OSError as rename_err:
logger.error(f"Could not rename corrupted file '{f}': {rename_err}")
# --- Retry Download and Probe ---
retried_successful_files, retried_failed_files = _download_and_probe_formats(formats_to_download_retry)
# --- Retry Download and Probe ---
retried_successful_files, retried_failed_files = _download_and_probe_formats(formats_to_download_retry)
if retried_failed_files:
logger.error(f"Probe failed again for {len(retried_failed_files)} file(s) after retry: {retried_failed_files}")
if retried_failed_files:
logger.error(f"Probe failed again for {len(retried_failed_files)} file(s) after retry: {retried_failed_files}")
final_success_list = successful_files + retried_successful_files
logger.info(f"Retry complete. Final success count: {len(final_success_list)} file(s).")
final_success_list = successful_files + retried_successful_files
if not final_success_list:
raise AirflowException("All files failed to download or probe correctly, even after retry.")
logger.info(f"Retry complete. Final success count: {len(final_success_list)} file(s).")
raise AirflowException("Download and probe process completed but produced no valid files.")
if params.get('yt_dlp_cleanup_mode', True):
logger.info(f"Cleanup mode is enabled. Creating .empty files and deleting originals for {len(final_success_list)} files.")
@ -935,6 +979,35 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
logger.error(f"Error during cleanup for file {f}: {e}", exc_info=True)
# Do not fail the task for a cleanup error, just log it.
# --- Move completed job directory to final destination ---
try:
video_id = _extract_video_id(original_url)
if not video_id:
logger.error(f"Could not extract video_id from URL '{original_url}' for final move. Skipping.")
else:
source_dir = download_dir # This is the job_dir_path
# Group downloads into 10-minute batch folders based on completion time.
now = datetime.now()
rounded_minute = (now.minute // 10) * 10
timestamp_str = now.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}"
final_dir_base = os.path.join(Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles'), 'videos', 'ready', timestamp_str)
final_dir_path = os.path.join(final_dir_base, video_id)
os.makedirs(final_dir_base, exist_ok=True)
logger.info(f"Moving completed job from '{source_dir}' to final destination '{final_dir_path}'")
if os.path.exists(final_dir_path):
logger.warning(f"Destination '{final_dir_path}' already exists. It will be removed and replaced.")
shutil.rmtree(final_dir_path)
shutil.move(source_dir, final_dir_path)
logger.info(f"Successfully moved job to '{final_dir_path}'.")
except Exception as e:
logger.error(f"Failed to move completed job directory: {e}", exc_info=True)
# Do not fail the task for a move error, just log it.
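# Worked example of the 10-minute batching above (illustrative, not part of this commit):
#   now = 2025-11-23 14:37  ->  rounded_minute = (37 // 10) * 10 = 30
#   timestamp_str = '20251123T1430'
#   final path    = <DOWNLOADS_TEMP>/videos/ready/20251123T1430/<video_id>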
return final_success_list
except Exception as e:
if 'HTTP Error 403: Forbidden' in str(e):
@ -1464,7 +1537,7 @@ with DAG(
'assigned_proxy_url': Param(None, type=["string", "null"], title="[Worker Param] Assigned Proxy URL", description="A specific proxy URL to use for the request, overriding the server's proxy pool logic."),
'clients': Param('tv_simply', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"),
'timeout': Param(DEFAULT_TIMEOUT, type="integer"),
'output_path_template': Param("%(title)s [%(id)s].f%(format_id)s.%(ext)s", type="string", title="[Worker Param] Output Path Template", description="Output filename template for yt-dlp. It is highly recommended to include `%(format_id)s` to prevent filename collisions when downloading multiple formats."),
'output_path_template': Param("%(id)s.f%(format_id)s.%(ext)s", type="string", title="[Worker Param] Output Path Template", description="Output filename template for yt-dlp. It is highly recommended to include `%(format_id)s` to prevent filename collisions when downloading multiple formats."),
'on_auth_failure': Param(
'retry_with_new_account',
type="string",

View File

@ -322,6 +322,7 @@ with DAG(
orchestrate_task = PythonOperator(
task_id='start_worker_loops',
python_callable=orchestrate_workers_ignition_callable,
queue='queue-mgmt',
)
orchestrate_task.doc_md = """
### Start Worker Loops

View File

@ -290,6 +290,7 @@ with DAG(
orchestrate_task = PythonOperator(
task_id='start_worker_loops',
python_callable=orchestrate_workers_ignition_callable,
queue='queue-mgmt',
)
orchestrate_task.doc_md = """
### Start Worker Loops

View File

@ -175,9 +175,14 @@ def _get_thrift_client(host, port, timeout):
return client, transport
def _extract_video_id(url):
"""Extracts YouTube video ID from URL."""
"""Extracts YouTube video ID from a URL or returns the input if it's already a valid ID."""
if not url or not isinstance(url, str):
return None
# Check if the input is already a valid 11-character video ID
if re.fullmatch(r'[a-zA-Z0-9_-]{11}', url):
return url
patterns = [r'v=([a-zA-Z0-9_-]{11})', r'youtu\.be/([a-zA-Z0-9_-]{11})']
for pattern in patterns:
match = re.search(pattern, url)
@ -389,7 +394,7 @@ def get_token(initial_data: dict, **context):
account_id = initial_data['account_id']
url = initial_data['url_to_process']
info_json_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')
info_json_dir = os.path.join(Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles'), 'videos', 'in-progress')
host, port = params['service_ip'], int(params['service_port'])
machine_id = params.get('machine_id') or socket.gethostname()
@ -468,6 +473,7 @@ def get_token(initial_data: dict, **context):
return {
'info_json_path': info_json_path,
'job_dir_path': job_dir_path,
'socks_proxy': proxy,
'ytdlp_command': None,
'successful_account_id': account_id,

View File

@ -36,6 +36,7 @@ import socket
import time
import traceback
import uuid
import shutil
# Import utility functions and Thrift modules
from utils.redis_utils import _get_redis_client
@ -128,9 +129,14 @@ DEFAULT_ARGS = {
# --- Helper Functions ---
def _extract_video_id(url):
"""Extracts YouTube video ID from URL."""
"""Extracts YouTube video ID from a URL or returns the input if it's already a valid ID."""
if not url or not isinstance(url, str):
return None
# Check if the input is already a valid 11-character video ID
if re.fullmatch(r'[a-zA-Z0-9_-]{11}', url):
return url
patterns = [r'v=([a-zA-Z0-9_-]{11})', r'youtu\.be/([a-zA-Z0-9_-]{11})']
for pattern in patterns:
match = re.search(pattern, url)
@ -288,7 +294,25 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
if not (info_json_path and os.path.exists(info_json_path)):
raise AirflowException(f"Error: info.json path is missing or file does not exist ({info_json_path}).")
download_dir = os.path.dirname(info_json_path)
# WORKAROUND: The auth service may inject a 'js_runtimes' key into the info.json
# that is incompatible with the yt-dlp library's expectations, causing a crash.
# We remove it here before passing it to the download tool.
try:
with open(info_json_path, 'r+', encoding='utf-8') as f:
info_data = json.load(f)
if 'js_runtimes' in info_data:
logger.info("Found 'js_runtimes' key in info.json. Removing it as a workaround for yt-dlp library incompatibility.")
del info_data['js_runtimes']
f.seek(0)
json.dump(info_data, f)
f.truncate()
except Exception as e:
logger.warning(f"Could not process/remove 'js_runtimes' from info.json: {e}", exc_info=True)
download_dir = token_data.get('job_dir_path')
if not download_dir:
# Fallback for older runs or if job_dir_path is missing
download_dir = os.path.dirname(info_json_path)
format_preset = params.get('download_format_preset', 'best_audio')
if format_preset == 'custom':
@ -322,20 +346,26 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
cmd.extend(['--output-dir', download_dir])
# The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args
# WORKAROUND: Due to an incompatibility between ytops-client and a recent yt-dlp
# library update, passing --extra-ytdlp-args to the 'py' downloader causes a crash.
# These arguments are being omitted until ytops-client is fixed.
# This affects: fragment_retries, limit_rate, socket_timeout, sleep_interval,
# max_sleep_interval, yt_dlp_test_mode, and the 'yt_dlp_extra_args' DAG param.
has_extra_args = (
params.get('fragment_retries') or params.get('limit_rate') or
params.get('socket_timeout') or params.get('min_sleep_interval') or
params.get('max_sleep_interval') or params.get('yt_dlp_test_mode') or
params.get('yt_dlp_extra_args')
)
if has_extra_args:
logger.warning("WORKAROUND: Omitting --extra-ytdlp-args for 'py' downloader due to a known incompatibility. "
"Some download parameters will be ignored.")
# The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args
py_extra_args = []
if params.get('fragment_retries'):
py_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])])
if params.get('limit_rate'):
py_extra_args.extend(['--limit-rate', params['limit_rate']])
if params.get('socket_timeout'):
py_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])])
if params.get('min_sleep_interval'):
py_extra_args.extend(['--sleep-interval', str(params['min_sleep_interval'])])
if params.get('max_sleep_interval'):
py_extra_args.extend(['--max-sleep-interval', str(params['max_sleep_interval'])])
if params.get('yt_dlp_test_mode'):
py_extra_args.append('--test')
existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '')
final_extra_args_list = existing_extra + py_extra_args
if final_extra_args_list:
final_extra_args_str = shlex.join(final_extra_args_list)
cmd.extend(['--extra-ytdlp-args', final_extra_args_str])
elif downloader == 'aria-rpc':
cmd.extend([
@ -376,7 +406,10 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)])
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
logger.info(f"Executing download command for format '{format_selector}': {copy_paste_cmd}")
logger.info(f"--- Preparing to execute ytops-client ---")
logger.info(f"Full ytops-client command for format '{format_selector}':")
logger.info(copy_paste_cmd)
logger.info(f"-----------------------------------------")
process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
if process.stdout:
@ -487,9 +520,20 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
final_formats_to_download = formats_to_download_initial
else:
for selector in formats_to_download_initial:
# A selector can be '140' or '299/298/137'
# A selector can be '140' or '299/298/137' or '140-dashy'
individual_ids = re.split(r'[/+]', selector)
if any(fid in available_formats for fid in individual_ids):
# Extract the numeric part of the format ID for checking against available_formats
is_available = False
for fid in individual_ids:
numeric_id_match = re.match(r'^\d+', fid)
if numeric_id_match:
numeric_id = numeric_id_match.group(0)
if numeric_id in available_formats:
is_available = True
break # Found a match, no need to check other parts of the selector
if is_available:
final_formats_to_download.append(selector)
else:
logger.warning(f"Requested format selector '{selector}' contains no available formats. Skipping.")
@ -504,54 +548,51 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
logger.info(f"Test mode: yt-dlp returned {len(successful_files)} filenames. Skipping probe failure checks.")
if not successful_files:
raise AirflowException("Test run did not produce any filenames.")
return successful_files
# Do not return here. Proceed to the cleanup and move logic.
if not failed_files:
if not successful_files:
raise AirflowException("Download and probe process completed but produced no valid files.")
return successful_files
final_success_list = successful_files
if failed_files:
# --- Handle Probe Failures and Retry ---
if not retry_on_probe_failure:
raise AirflowException(f"Probe failed for {len(failed_files)} file(s) and retry is disabled: {failed_files}")
# --- Handle Probe Failures and Retry ---
if not retry_on_probe_failure:
raise AirflowException(f"Probe failed for {len(failed_files)} file(s) and retry is disabled: {failed_files}")
logger.warning(f"Probe failed for {len(failed_files)} file(s). Attempting one re-download for failed files...")
logger.warning(f"Probe failed for {len(failed_files)} file(s). Attempting one re-download for failed files...")
format_ids_to_retry = []
# Since each download is now for a specific selector and the output template
# includes the format_id, we can always attempt to extract the format_id
# from the failed filename for a targeted retry.
for f in failed_files:
match = re.search(r'\.f([\d]+)\.', f)
if match:
format_ids_to_retry.append(match.group(1))
else:
logger.error(f"Could not extract format_id from failed file '{f}'. Cannot retry this specific file.")
formats_to_download_retry = format_ids_to_retry
format_ids_to_retry = []
# Since each download is now for a specific selector and the output template
# includes the format_id, we can always attempt to extract the format_id
# from the failed filename for a targeted retry.
for f in failed_files:
match = re.search(r'\.f([\d]+)\.', f)
if match:
format_ids_to_retry.append(match.group(1))
else:
logger.error(f"Could not extract format_id from failed file '{f}'. Cannot retry this specific file.")
formats_to_download_retry = format_ids_to_retry
if not formats_to_download_retry:
raise AirflowException("Probe failed, but could not determine which formats to retry.")
if not formats_to_download_retry:
raise AirflowException("Probe failed, but could not determine which formats to retry.")
# Rename failed files to allow for a fresh download attempt
for f in failed_files:
try:
failed_path = f"{f}.probe_failed_{int(time.time())}"
os.rename(f, failed_path)
logger.info(f"Renamed corrupted file to {failed_path}")
except OSError as rename_err:
logger.error(f"Could not rename corrupted file '{f}': {rename_err}")
# Rename failed files to allow for a fresh download attempt
for f in failed_files:
try:
failed_path = f"{f}.probe_failed_{int(time.time())}"
os.rename(f, failed_path)
logger.info(f"Renamed corrupted file to {failed_path}")
except OSError as rename_err:
logger.error(f"Could not rename corrupted file '{f}': {rename_err}")
# --- Retry Download and Probe ---
retried_successful_files, retried_failed_files = _download_and_probe_formats(formats_to_download_retry)
# --- Retry Download and Probe ---
retried_successful_files, retried_failed_files = _download_and_probe_formats(formats_to_download_retry)
if retried_failed_files:
logger.error(f"Probe failed again for {len(retried_failed_files)} file(s) after retry: {retried_failed_files}")
if retried_failed_files:
logger.error(f"Probe failed again for {len(retried_failed_files)} file(s) after retry: {retried_failed_files}")
final_success_list = successful_files + retried_successful_files
logger.info(f"Retry complete. Final success count: {len(final_success_list)} file(s).")
final_success_list = successful_files + retried_successful_files
if not final_success_list:
raise AirflowException("All files failed to download or probe correctly, even after retry.")
logger.info(f"Retry complete. Final success count: {len(final_success_list)} file(s).")
raise AirflowException("Download and probe process completed but produced no valid files.")
if params.get('yt_dlp_cleanup_mode', True):
logger.info(f"Cleanup mode is enabled. Creating .empty files and deleting originals for {len(final_success_list)} files.")
@ -567,6 +608,35 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
logger.error(f"Error during cleanup for file {f}: {e}", exc_info=True)
# Do not fail the task for a cleanup error, just log it.
# --- Move completed job directory to final destination ---
try:
video_id = _extract_video_id(original_url)
if not video_id:
logger.error(f"Could not extract video_id from URL '{original_url}' for final move. Skipping.")
else:
source_dir = download_dir # This is the job_dir_path
# Group downloads into 10-minute batch folders based on completion time.
now = datetime.now()
rounded_minute = (now.minute // 10) * 10
timestamp_str = now.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}"
final_dir_base = os.path.join(Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles'), 'videos', 'ready', timestamp_str)
final_dir_path = os.path.join(final_dir_base, video_id)
os.makedirs(final_dir_base, exist_ok=True)
logger.info(f"Moving completed job from '{source_dir}' to final destination '{final_dir_path}'")
if os.path.exists(final_dir_path):
logger.warning(f"Destination '{final_dir_path}' already exists. It will be removed and replaced.")
shutil.rmtree(final_dir_path)
shutil.move(source_dir, final_dir_path)
logger.info(f"Successfully moved job to '{final_dir_path}'.")
except Exception as e:
logger.error(f"Failed to move completed job directory: {e}", exc_info=True)
# Do not fail the task for a move error, just log it.
return final_success_list
@task
@ -799,7 +869,7 @@ with DAG(
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string"),
'machine_id': Param(None, type=["string", "null"]),
'clients': Param('mweb,web_camoufox,tv', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"),
'output_path_template': Param("%(title)s [%(id)s].f%(format_id)s.%(ext)s", type="string", title="[Worker Param] Output Path Template", description="Output filename template for yt-dlp. It is highly recommended to include `%(format_id)s` to prevent filename collisions when downloading multiple formats."),
'output_path_template': Param("%(id)s.f%(format_id)s.%(ext)s", type="string", title="[Worker Param] Output Path Template", description="Output filename template for yt-dlp. It is highly recommended to include `%(format_id)s` to prevent filename collisions when downloading multiple formats."),
'retry_on_probe_failure': Param(False, type="boolean"),
'skip_probe': Param(False, type="boolean", title="[Worker Param] Skip Probe", description="If True, skips the ffmpeg probe of downloaded files."),
'yt_dlp_cleanup_mode': Param(True, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),

View File

@ -0,0 +1,417 @@
# -*- coding: utf-8 -*-
"""
DAG to upload completed video directories to an S3-compatible service.
This DAG creates one long-running task for each configured S3 worker.
"""
from __future__ import annotations
import logging
import os
import shutil
import subprocess
import time
from datetime import datetime, timedelta
from airflow.decorators import task
from airflow.exceptions import AirflowException
from airflow.models.dag import DAG
from airflow.models.param import Param
from airflow.models.variable import Variable
from airflow.operators.dummy import DummyOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.dates import days_ago
logger = logging.getLogger(__name__)
DEFAULT_ARGS = {
'owner': 'airflow',
'retries': 1,
'retry_delay': timedelta(minutes=1),
}
BASE_DOWNLOAD_PATH = '/opt/airflow/downloadfiles'
VIDEOS_PATH = os.path.join(BASE_DOWNLOAD_PATH, 'videos')
READY_PATH = os.path.join(VIDEOS_PATH, 'ready')
def run_s3_upload_batch(**context):
"""
This function runs in a continuous loop to check for completed video directories and upload them to S3.
If no videos are found, it sleeps for a configurable interval before checking again.
Dry run mode is non-destructive and will pause briefly after checking to prevent tight loops.
"""
params = context['params']
concurrency = params['concurrency']
mode = params['mode']
dry_run = params['dry_run']
sleep_interval_min = params['sleep_if_no_videos_min']
sleep_interval_sec = sleep_interval_min * 60
s3_conn_id = params['s3_conn_id']
s3_access_key_id = None
s3_secret_access_key = None
s3_endpoint = None
s3_bucket = None
s3_region = None
config_source = "Unknown"
profile_name = "rusonyx"
# --- Attempt 1: Get S3 Configuration from Airflow Connection ---
if s3_conn_id:
try:
logger.info(f"Attempting to load S3 configuration from Airflow connection '{s3_conn_id}'.")
s3_hook = S3Hook(aws_conn_id=s3_conn_id)
s3_conn = s3_hook.get_connection(s3_conn_id)
s3_access_key_id = s3_conn.login
s3_secret_access_key = s3_conn.password
s3_endpoint = s3_conn.host
extra_config = s3_conn.extra_dejson
s3_bucket = extra_config.get('bucket')
s3_region = extra_config.get('region_name')
if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_bucket, s3_region]):
logger.warning("S3 connection from Airflow is missing one or more required fields. Will attempt to fall back to environment variables.")
s3_access_key_id = s3_secret_access_key = s3_endpoint = s3_bucket = s3_region = None # Reset all
else:
config_source = f"Airflow Connection '{s3_conn_id}'"
profile_name = "rusonyx-airflow"
except Exception as e:
logger.warning(f"Failed to load S3 configuration from Airflow connection '{s3_conn_id}': {e}. Will attempt to fall back to environment variables.")
# --- Attempt 2: Fallback to Environment Variables ---
if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_bucket, s3_region]):
try:
logger.info("Attempting to load S3 configuration from environment variables as a fallback.")
s3_access_key_id = os.environ['S3_DELIVERY_AWS_ACCESS_KEY_ID']
s3_secret_access_key = os.environ['S3_DELIVERY_AWS_SECRET_ACCESS_KEY']
s3_endpoint = os.environ['S3_DELIVERY_ENDPOINT']
s3_bucket = os.environ['S3_DELIVERY_BUCKET']
s3_region = os.environ['S3_DELIVERY_AWS_REGION']
if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_bucket, s3_region]):
raise ValueError("One or more S3 configuration environment variables are empty.")
config_source = "Environment Variables"
profile_name = "rusonyx"
except (KeyError, ValueError) as e:
logger.error(f"Failed to read S3 configuration from environment variables: {e}", exc_info=True)
raise AirflowException("S3 configuration is missing. Could not load from Airflow connection or environment variables.")
s3_destination = f"s3://{s3_bucket}/"
logger.info(f"Starting S3 upload loop. Watching source '{READY_PATH}' for delivery to '{s3_destination}'.")
logger.info(f"Mode: {mode}, Dry Run: {dry_run}, Idle Sleep: {sleep_interval_min} min")
logger.info(f"S3 Config loaded from {config_source}: Endpoint='{s3_endpoint}', Bucket='{s3_bucket}', Region='{s3_region}', Profile='{profile_name}'")
# --- Write credentials to file for s5cmd profile ---
aws_credentials_path = os.path.expanduser("~/.aws/credentials")
aws_config_path = os.path.expanduser("~/.aws/config")
try:
os.makedirs(os.path.dirname(aws_credentials_path), exist_ok=True)
with open(aws_credentials_path, 'w') as f:
f.write(f"[{profile_name}]\n")
f.write(f"aws_access_key_id = {s3_access_key_id}\n")
f.write(f"aws_secret_access_key = {s3_secret_access_key}\n")
logger.info(f"Wrote credentials for profile '{profile_name}' to {aws_credentials_path}")
with open(aws_config_path, 'w') as f:
f.write(f"[profile {profile_name}]\n")
f.write(f"region = {s3_region}\n")
logger.info(f"Wrote config for profile '{profile_name}' to {aws_config_path}")
except Exception as e:
logger.error(f"Failed to write AWS credentials/config file: {e}", exc_info=True)
raise AirflowException(f"Failed to write AWS credentials/config file: {e}")
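# Resulting files (illustrative, placeholder values) consumed by `s5cmd --profile <profile_name>`,
# where <profile_name> is 'rusonyx' or 'rusonyx-airflow' depending on the config source:
#   ~/.aws/credentials                     ~/.aws/config
#   [rusonyx]                              [profile rusonyx]
#   aws_access_key_id = <access key>       region = <region>
#   aws_secret_access_key = <secret key>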
while True:
logger.info("--- Starting new S3 upload cycle ---")
# --- Dry Run Logic (Non-destructive) ---
if dry_run:
logger.info("[DRY RUN] Checking for completed video batches...")
if not os.path.exists(READY_PATH):
logger.info(f"[DRY RUN] Source directory '{READY_PATH}' does not exist. Nothing to upload.")
else:
now = datetime.now()
wait_minutes = params['batch_completion_wait_min']
cutoff_time = now - timedelta(minutes=wait_minutes)
rounded_minute = (cutoff_time.minute // 10) * 10
cutoff_batch_ts = cutoff_time.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}"
logger.info(f"[DRY RUN] Current time is {now.strftime('%H:%M:%S')}. With a {wait_minutes} min wait, processing batches up to and including '{cutoff_batch_ts}'.")
all_video_dirs_to_process = []
processed_batch_dirs = set()
all_batch_dirs = sorted([d for d in os.listdir(READY_PATH) if os.path.isdir(os.path.join(READY_PATH, d))])
for ts_dir in all_batch_dirs:
if ts_dir > cutoff_batch_ts:
continue
batch_dir_path = os.path.join(READY_PATH, ts_dir)
video_dirs_in_batch = [os.path.join(batch_dir_path, d) for d in os.listdir(batch_dir_path) if os.path.isdir(os.path.join(batch_dir_path, d))]
if video_dirs_in_batch:
all_video_dirs_to_process.extend(video_dirs_in_batch)
processed_batch_dirs.add(batch_dir_path)
else:
logger.info(f"[DRY RUN] Batch directory '{batch_dir_path}' is empty. Would remove it.")
if all_video_dirs_to_process:
logger.info(f"[DRY RUN] Found {len(all_video_dirs_to_process)} total video director(y/ies) in {len(processed_batch_dirs)} batch(es) to process.")
# Construct and log the command that would be run
cmd = [
's5cmd', '--endpoint-url', s3_endpoint, '--log', 'debug', '--no-verify-ssl',
'--use-list-objects-v1', '--profile', profile_name, '--stat',
'--numworkers', str(concurrency), 'run'
]
cmd_str = ' '.join(cmd)
# Construct the commands to be piped
commands_to_pipe = '\n'.join([f"cp \"{dir_path}\" \"{s3_destination}\"" for dir_path in all_video_dirs_to_process])
logger.info(f"[DRY RUN] The following command would be executed:\n{cmd_str}")
logger.info(f"[DRY RUN] The following commands would be piped to stdin:\n{commands_to_pipe}")
if mode == 'mv':
logger.info(f"[DRY RUN] Mode is 'mv'. Would delete {len(processed_batch_dirs)} source batch directories after successful upload.")
# Pause briefly in dry-run mode if videos are found to avoid a fast, noisy loop.
dry_run_pause_s = 10
logger.info(f"[DRY RUN] Pausing for {dry_run_pause_s} seconds to prevent rapid re-listing of the same files (this is a short, fixed pause for dry-run only).")
time.sleep(dry_run_pause_s)
continue # Go to the start of the next cycle
else:
logger.info("[DRY RUN] No completed video batches found.")
# If in dry-run and no videos are found, sleep for the main interval.
logger.info(f"[DRY RUN] Sleeping for {sleep_interval_min} minute(s)...")
time.sleep(sleep_interval_sec)
continue
# --- Normal Operation Logic (Destructive) ---
work_done_in_cycle = False
try:
# --- 1. Find all videos to upload from all completed batches ---
if not os.path.exists(READY_PATH):
logger.info(f"Ready directory '{READY_PATH}' does not exist. Nothing to upload.")
else:
now = datetime.now()
wait_minutes = params['batch_completion_wait_min']
cutoff_time = now - timedelta(minutes=wait_minutes)
rounded_minute = (cutoff_time.minute // 10) * 10
cutoff_batch_ts = cutoff_time.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}"
logger.info(f"Current time is {now.strftime('%H:%M:%S')}. With a {wait_minutes} min wait, processing batches up to and including '{cutoff_batch_ts}'.")
all_video_dirs_to_process = []
processed_batch_dirs = set()
all_batch_dirs = sorted([d for d in os.listdir(READY_PATH) if os.path.isdir(os.path.join(READY_PATH, d))])
for ts_dir in all_batch_dirs:
if ts_dir > cutoff_batch_ts:
continue # This batch is not old enough to be processed
batch_dir_path = os.path.join(READY_PATH, ts_dir)
video_dirs_in_batch = [os.path.join(batch_dir_path, d) for d in os.listdir(batch_dir_path) if os.path.isdir(os.path.join(batch_dir_path, d))]
if not video_dirs_in_batch:
logger.info(f"Batch directory '{batch_dir_path}' is empty. Removing it.")
try:
os.rmdir(batch_dir_path)
except OSError as e:
logger.error(f"Could not remove empty batch directory {batch_dir_path}: {e}")
continue # Move to the next batch
all_video_dirs_to_process.extend(video_dirs_in_batch)
processed_batch_dirs.add(batch_dir_path)
# --- 2. Upload All Found Videos in a Single Batch Command ---
if all_video_dirs_to_process:
work_done_in_cycle = True
logger.info(f"Found {len(all_video_dirs_to_process)} total video director(y/ies) in {len(processed_batch_dirs)} batch(es) to upload.")
cmd = [
's5cmd', '--endpoint-url', s3_endpoint, '--log', 'debug', '--no-verify-ssl',
'--use-list-objects-v1', '--profile', profile_name, '--stat',
'--numworkers', str(concurrency), 'run'
]
cmd_str = ' '.join(cmd)
# Construct the commands to be piped to stdin
commands_to_pipe = '\n'.join([f"cp \"{dir_path}\" \"{s3_destination}\"" for dir_path in all_video_dirs_to_process])
logger.info(f"Executing s5cmd batch command:\n{cmd_str}")
logger.info(f"Piping {len(all_video_dirs_to_process)} 'cp' commands to stdin.")
upload_start_time = time.time()
process = subprocess.run(cmd, check=True, capture_output=True, text=True, input=commands_to_pipe)
upload_duration = time.time() - upload_start_time
logger.info(f"s5cmd STDOUT: {process.stdout}")
if process.stderr:
logger.info(f"s5cmd STDERR: {process.stderr}")
logger.info(f"Upload command completed successfully in {upload_duration:.2f} seconds.")
logger.info(f"Successfully copied {len(all_video_dirs_to_process)} director(y/ies) to S3.")
# --- 3. Cleanup ---
if mode == 'mv':
logger.info(f"Mode is 'mv'. Cleaning up {len(processed_batch_dirs)} source batch director(y/ies).")
cleanup_start_time = time.time()
# Create a temporary empty directory to use as a source for rsync deletion
empty_dir_for_rsync = os.path.join(READY_PATH, f"__empty_{int(time.time())}")
os.makedirs(empty_dir_for_rsync, exist_ok=True)
try:
for batch_dir_path in processed_batch_dirs:
try:
# Use rsync with an empty source to efficiently delete the contents of the batch directory
# The trailing slash on both source and destination is important.
rsync_cmd = [
'rsync',
'-a', '--delete',
f'{empty_dir_for_rsync}/',
f'{batch_dir_path}/'
]
subprocess.run(rsync_cmd, check=True, capture_output=True, text=True)
# After the contents are deleted, remove the now-empty directory
os.rmdir(batch_dir_path)
logger.info(f"Successfully removed {batch_dir_path}")
except Exception as cleanup_e:
logger.error(f"Failed to remove directory {batch_dir_path}: {cleanup_e}", exc_info=True)
if isinstance(cleanup_e, subprocess.CalledProcessError):
logger.error(f"rsync STDERR: {cleanup_e.stderr}")
finally:
# Clean up the temporary empty directory
shutil.rmtree(empty_dir_for_rsync)
cleanup_duration = time.time() - cleanup_start_time
logger.info(f"Cleanup complete in {cleanup_duration:.2f} seconds.")
else: # mode == 'cp'
logger.info(f"Mode is 'cp'. Source directories will be left for inspection.")
if not work_done_in_cycle:
logger.info(f"No completed video batches found in '{READY_PATH}'.")
except Exception as e:
logger.error(f"An error occurred during the S3 upload cycle: {e}", exc_info=True)
if isinstance(e, subprocess.CalledProcessError):
logger.error(f"s5cmd STDERR: {e.stderr}")
# On error, we do NOT clean up, to allow for investigation and retries.
# The failed directories will be picked up in the next cycle.
# Treat errors as "no work done" to trigger sleep and prevent fast failure loops
work_done_in_cycle = False
# --- Loop Control ---
if not work_done_in_cycle:
logger.info(f"No work done in this cycle. Sleeping for {sleep_interval_min} minute(s)...")
time.sleep(sleep_interval_sec)
else:
logger.info("Work was completed in this cycle. Checking for more immediately.")
with DAG(
dag_id='ytdlp_s3_uploader',
default_args=DEFAULT_ARGS,
schedule=None,
start_date=days_ago(1),
catchup=False,
tags=['ytdlp', 's3', 'upload'],
doc_md="""### S3 Uploader DAG
1. This DAG creates dynamic uploader tasks with clear names depicting their worker machine (e.g., `upload_batch_on_dl001`).
2. Ansible updates an Airflow Variable named `s3_worker_hostnames` with a JSON list of all active uploader workers (typically dlXXX machines). Each worker listens to its own queue (e.g., `queue-dl-dl001`).
3. This DAG reads the variable on manual trigger or after a pause/resume cycle to create the dynamic tasks. This allows for easy inspection of per-worker logs and status from the Airflow UI.
4. Each dynamic task watches a shared folder (`/opt/airflow/downloadfiles/videos/ready`). Download workers place completed videos into timestamped sub-folders (e.g., `20241122T1050`). The uploader processes these 10-minute batches, copying them to S3 with `s5cmd` and then deleting the source directories. This design avoids race conditions and improves performance.
""",
params={
'mode': Param(
'mv', type="string", enum=['cp', 'mv'], title="Operation Mode",
description="`mv` (move): After a successful upload, the temporary batch directory is deleted. This is the standard behavior. `cp` (copy): The temporary batch directory is left intact for debugging; it will be cleaned up on the next run."
),
'dry_run': Param(
True, type="boolean", title="Dry Run",
description="If True, the DAG performs all steps except the actual upload and cleanup: the `s5cmd` command and the `cp` operations that would be piped to it are only logged, and the final directory removal is skipped. Log messages indicate what would have happened."
),
'concurrency': Param(10, type="integer", title="s5cmd Concurrency"),
'sleep_if_no_videos_min': Param(10, type="integer", title="Sleep if Idle (minutes)", description="How many minutes the task should sleep if no videos are found to upload."),
'batch_completion_wait_min': Param(0, type="integer", title="Batch Completion Wait (minutes)", description="How many minutes to wait after a 10-minute batch window closes before considering it for upload. Default is 0, which processes the current batch immediately. A value of 10 restores the old behavior of waiting for the next 10-minute window."),
's3_conn_id': Param('s3_delivery_connection', type="string", title="S3 Connection ID", description="The Airflow connection ID for the S3-compatible storage. If this connection is invalid or missing, the task will fall back to environment variables."),
}
) as dag:
# Dynamically create one task per S3 worker hostname
# IMPORTANT: The tasks are created when this DAG file is parsed by the Airflow Scheduler.
# If you add/change the 's3_worker_hostnames' Airflow Variable, you may need to
# wait a few minutes for the scheduler to re-parse the file and update the tasks.
# Forcing a re-parse can be done by pausing and un-pausing the DAG in the UI.
s3_worker_hostnames = [] # Initialize to be safe
try:
# The variable should be a JSON list of strings, e.g., ["s3-001", "s3-002"]
s3_worker_hostnames = Variable.get("s3_worker_hostnames", deserialize_json=True, default_var=[])
logger.info(f"DAG 'ytdlp_s3_uploader' successfully loaded s3_worker_hostnames variable. Value: {s3_worker_hostnames}")
if not isinstance(s3_worker_hostnames, list):
logger.error(f"Airflow Variable 's3_worker_hostnames' is not a valid JSON list. Value: {s3_worker_hostnames}")
s3_worker_hostnames = [] # Reset to empty to prevent errors
except Exception as e:
logger.error(
f"Could not read or parse Airflow Variable 's3_worker_hostnames'. "
f"Please create it in the Airflow UI as a JSON list of your S3 worker hostnames (e.g., [\"s3-001\"]). "
f"No S3 worker tasks will be created. Error: {e}",
exc_info=True
)
s3_worker_hostnames = []
@task(task_id='check_s3_worker_configuration')
def check_s3_worker_configuration_callable():
"""Logs the current value of the s3_worker_hostnames variable at runtime for debugging."""
logger.info("--- S3 Worker Configuration Check (at runtime) ---")
try:
hostnames = Variable.get("s3_worker_hostnames", deserialize_json=True, default_var=None)
if hostnames is None:
logger.error("Airflow Variable 's3_worker_hostnames' is not defined.")
logger.info("Please create it in the Airflow UI (Admin -> Variables) as a JSON list of strings, e.g., [\"s3-worker-01\"]")
elif not isinstance(hostnames, list):
logger.error(f"Airflow Variable 's3_worker_hostnames' is not a valid JSON list. Current value: {hostnames}")
elif not hostnames:
logger.warning("Airflow Variable 's3_worker_hostnames' is defined but is an empty list []. No worker tasks will be run.")
else:
logger.info(f"Successfully read 's3_worker_hostnames'. It contains {len(hostnames)} worker(s): {hostnames}")
logger.info("If you see this task but no worker tasks in the UI, it means the DAG did not find these workers when it was parsed by the scheduler.")
logger.info("This can happen due to caching. Please wait a few minutes for the scheduler to re-parse the DAG file, or pause/un-pause the DAG.")
except Exception as e:
logger.error(f"An error occurred while trying to read the 's3_worker_hostnames' variable at runtime: {e}", exc_info=True)
logger.info("--- End of Configuration Check ---")
check_s3_worker_configuration_task = check_s3_worker_configuration_callable()
check_s3_worker_configuration_task.doc_md = """
### S3 Worker Configuration Check
This task runs at the start of every DAG run to check the `s3_worker_hostnames` Airflow Variable.
The dynamic worker tasks are created based on this variable *at the time the DAG is parsed by the scheduler*.
**Check the logs for this task to see the current value of the variable as read at runtime.** This can help diagnose why worker tasks may not have been created.
If the logs show the variable is correct but you don't see the worker tasks in the UI, you may need to wait for the scheduler to re-parse the DAG file. You can force this by pausing and un-pausing the DAG.
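The same effect is available from the CLI (a sketch, assuming the DAG id is `ytdlp_s3_uploader` as seen in the scheduler logs): `airflow dags pause ytdlp_s3_uploader && airflow dags unpause ytdlp_s3_uploader`.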
"""
if s3_worker_hostnames:
worker_tasks = []
for hostname in s3_worker_hostnames:
# Sanitize hostname for task_id
task_id_hostname = hostname.replace('.', '_')
# Create a task for each worker, pinned to its specific queue
upload_task = task(
task_id=f'upload_batch_on_{task_id_hostname}',
queue=f'queue-s3-{hostname}'
)(run_s3_upload_batch)()
worker_tasks.append(upload_task)
check_s3_worker_configuration_task >> worker_tasks

File diff suppressed because it is too large

View File

@ -8,3 +8,8 @@ vault_ss_password_2: "tgtQcfjJp/A3F01g4woO0bEQoxij3CAOK/iR1OTPuF4="
vault_dockerhub_password: "dckr_pat_DmFFqwFEdXFvZlgngGY9ooBaq6o"
vault_s3_access_key_id: "admin"
vault_s3_secret_access_key: "0153093693-0009"
vault_s3_delivery_access_key_id: "4d33e37e87c945718478e8003f6e93fb"
vault_s3_delivery_secret_access_key: "33b155c5d2ea4fccb0faeeefb420d7ac"
vault_s3_delivery_endpoint: "https://s3.rusonyxcloud.ru"
vault_s3_delivery_bucket: "videos"
vault_s3_delivery_aws_region: "ru-msk"

View File

@ -2,8 +2,8 @@
- name: Deploy Airflow DL Worker Stack
hosts: airflow_workers
vars_files:
- group_vars/all.yml
- group_vars/all/vault.yml
- "{{ inventory_dir }}/group_vars/all/vault.yml"
- "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
pre_tasks:
- name: Announce fast deploy mode if enabled
debug:
@ -17,13 +17,17 @@
path: "{{ airflow_worker_dir }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Template .env.worker
- name: Template .env file for worker
template:
src: templates/.env.worker.j2
src: templates/.env.j2
dest: "{{ airflow_worker_dir }}/.env"
mode: '0600'
vars:
service_role: "worker"
- name: Template docker-compose file for Airflow worker
template:
@ -31,6 +35,34 @@
dest: "{{ airflow_worker_dir }}/configs/docker-compose-dl.yaml"
mode: '0644'
- name: Ensure configs directory exists for config generator
file:
path: "{{ airflow_worker_dir }}/configs"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Sync config generator script
ansible.posix.synchronize:
src: ../airflow/generate_envoy_config.py
dest: "{{ airflow_worker_dir }}/"
archive: yes
rsync_path: "sudo rsync"
- name: Sync config generator templates
ansible.posix.synchronize:
src: ../airflow/configs/{{ item }}
dest: "{{ airflow_worker_dir }}/configs/"
archive: yes
rsync_path: "sudo rsync"
loop:
- docker-compose.config-generate.yaml
- envoy.yaml.j2
- docker-compose.camoufox.yaml.j2
- docker-compose-ytdlp-ops.yaml.j2
- name: Build Airflow worker image from local Dockerfile
community.docker.docker_image:
name: "{{ airflow_image_name }}"
@ -50,7 +82,7 @@
- name: Generate dynamic configs (camoufox + envoy)
shell:
cmd: "docker compose -f configs/docker-compose.config-generate.yaml run --rm config-generator"
cmd: "docker compose --project-directory . -f configs/docker-compose.config-generate.yaml run --rm config-generator"
chdir: "{{ airflow_worker_dir }}"
- name: Start worker services
@ -59,6 +91,7 @@
files:
- configs/docker-compose-dl.yaml
- configs/docker-compose-ytdlp-ops.yaml
- configs/docker-compose.camoufox.yaml
state: present
remove_orphans: true
pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"

View File

@ -2,14 +2,30 @@
- name: Deploy Airflow DL Worker Stack
hosts: airflow_workers
vars_files:
- group_vars/all.yml
- group_vars/all/vault.yml
- "{{ inventory_dir }}/group_vars/all/vault.yml"
- "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
pre_tasks:
- name: Announce fast deploy mode if enabled
debug:
msg: "🚀 FAST DEPLOY MODE ENABLED: Skipping Docker image builds and pulls. 🚀"
when: fast_deploy | default(false)
run_once: true
- name: Install python3-pip
ansible.builtin.apt:
name: python3-pip
state: present
become: yes
- name: Install required python packages for ytops-client on host
ansible.builtin.pip:
name:
- thrift
- aria2p
- PyYAML
state: present
extra_args: --break-system-packages
become: yes
tasks:
- name: Ensure worker directory exists
@ -17,13 +33,17 @@
path: "{{ airflow_worker_dir }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Template .env.worker
- name: Template .env file for worker
template:
src: templates/.env.worker.j2
src: templates/.env.j2
dest: "{{ airflow_worker_dir }}/.env"
mode: '0600'
vars:
service_role: "worker"
- name: Template docker-compose file for Airflow worker
template:
@ -31,6 +51,34 @@
dest: "{{ airflow_worker_dir }}/configs/docker-compose-dl.yaml"
mode: '0644'
- name: Ensure configs directory exists for config generator
file:
path: "{{ airflow_worker_dir }}/configs"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Sync config generator script
ansible.posix.synchronize:
src: ../airflow/generate_envoy_config.py
dest: "{{ airflow_worker_dir }}/"
archive: yes
rsync_path: "sudo rsync"
- name: Sync config generator templates
ansible.posix.synchronize:
src: ../airflow/configs/{{ item }}
dest: "{{ airflow_worker_dir }}/configs/"
archive: yes
rsync_path: "sudo rsync"
loop:
- docker-compose.config-generate.yaml
- envoy.yaml.j2
- docker-compose.camoufox.yaml.j2
- docker-compose-ytdlp-ops.yaml.j2
- name: Build Airflow worker image from local Dockerfile
community.docker.docker_image:
name: "{{ airflow_image_name }}"
@ -49,7 +97,7 @@
- name: Generate dynamic configs (camoufox + envoy)
shell:
cmd: "docker compose -f configs/docker-compose.config-generate.yaml run --rm config-generator"
cmd: "docker compose --project-directory . -f configs/docker-compose.config-generate.yaml run --rm config-generator"
chdir: "{{ airflow_worker_dir }}"
- name: Start worker services
@ -61,3 +109,22 @@
state: present
remove_orphans: true
pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"
- name: Stop camoufox services (deprecated)
community.docker.docker_compose_v2:
project_src: "{{ airflow_worker_dir }}"
files:
- configs/docker-compose.camoufox.yaml
state: absent
ignore_errors: true
- name: Update Airflow variable with S3 worker hostnames
ansible.builtin.command: >
docker compose exec -T airflow-scheduler
airflow variables set s3_worker_hostnames
'{{ groups["airflow_workers"] | map("regex_replace", "\\..*", "") | list | to_json }}'
args:
chdir: "{{ airflow_master_dir }}"
become: yes
delegate_to: "{{ groups['airflow_master'][0] }}"
run_once: true
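# Note on the hostname mapping above (example values are illustrative): the regex_replace
# filter strips everything from the first dot, so an inventory of
# ["s3-001.example.net", "s3-002"] sets the variable to ["s3-001", "s3-002"], matching the
# short hostnames used for the per-worker queue-s3-<hostname> Celery queues.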

View File

@ -189,6 +189,17 @@
- name: Restart Airflow worker on WORKER to apply hook
when: inventory_hostname in groups['airflow_workers']
ansible.builtin.command:
cmd: "docker compose restart airflow-worker-dl airflow-worker-auth"
cmd: "docker compose restart airflow-worker-dl airflow-worker-auth airflow-worker-mgmt airflow-worker-s3"
chdir: "{{ airflow_worker_dir }}"
become: yes
- name: Update Airflow variable with S3 worker hostnames
ansible.builtin.command: >
docker compose exec -T airflow-scheduler
airflow variables set s3_worker_hostnames
'{{ groups["airflow_workers"] | map("regex_replace", "\\..*", "") | list | to_json }}'
args:
chdir: "{{ airflow_master_dir }}"
become: yes
when: inventory_hostname in groups['airflow_master']
run_once: true

View File

@ -68,6 +68,6 @@
- name: Restart Airflow worker on WORKER
when: inventory_hostname in groups['airflow_workers']
ansible.builtin.command:
cmd: "docker compose restart airflow-worker-dl airflow-worker-auth"
cmd: "docker compose restart airflow-worker-dl airflow-worker-auth airflow-worker-mgmt airflow-worker-s3"
chdir: "{{ airflow_worker_dir }}"
become: yes

View File

@ -13,6 +13,22 @@
debug:
msg: "Starting deployment for Airflow Master: {{ inventory_hostname }} ({{ ansible_host }})"
- name: Install python3-pip
ansible.builtin.apt:
name: python3-pip
state: present
become: yes
- name: Install required python packages
ansible.builtin.pip:
name:
- thrift
- aria2p
- PyYAML
- apache-airflow-providers-amazon
state: present
become: yes
- name: Configure Redis memory overcommit setting
copy:
src: "configs/etc/sysctl.d/99-redis-overcommit.conf"
@ -156,6 +172,20 @@
mode: '0755'
become: yes
- name: Ensure runtime data directories exist with correct ownership
ansible.builtin.file:
path: "{{ airflow_master_dir }}/{{ item }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0775'
recurse: yes
loop:
- "downloadfiles"
- "inputfiles"
- "dumps"
become: yes
- name: Sync python packages to master for build context
ansible.posix.synchronize:
src: "../{{ item }}/"
@ -249,14 +279,58 @@
var: config_generator_result.stdout_lines
when: config_generator_result.changed
- name: Start ytdlp-ops services on master
community.docker.docker_compose_v2:
project_src: "{{ airflow_master_dir }}"
files:
- configs/docker-compose-ytdlp-ops.yaml
state: present
remove_orphans: true
pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"
roles:
- ytdlp-master
- airflow-master
post_tasks:
- name: Include camoufox verification tasks
include_tasks: tasks/verify_camoufox.yml
when: not fast_deploy | default(false)
- name: Delete existing Airflow redis_default connection to ensure an idempotent update
ansible.builtin.command: >
docker compose exec -T airflow-scheduler
airflow connections delete redis_default
args:
chdir: "{{ airflow_master_dir }}"
register: delete_redis_conn
retries: 5
delay: 10
until: delete_redis_conn.rc == 0 or 'not found' in delete_redis_conn.stderr
changed_when: "'was deleted successfully' in delete_redis_conn.stdout"
failed_when:
- delete_redis_conn.rc != 0
- "'not found' not in delete_redis_conn.stderr"
become: yes
become_user: "{{ ansible_user }}"
- name: Add Airflow redis_default connection
ansible.builtin.command: >
docker compose exec -T airflow-scheduler
airflow connections add redis_default
--conn-uri 'redis://:{{ vault_redis_password }}@{{ ansible_host }}:{{ redis_port }}/{{ redis_db_celery_broker | default(1) }}'
args:
chdir: "{{ airflow_master_dir }}"
register: add_redis_conn
retries: 5
delay: 10
until: add_redis_conn.rc == 0
changed_when: "'was successfully added' in add_redis_conn.stdout"
become: yes
become_user: "{{ ansible_user }}"
- name: Update S3 delivery connection
ansible.builtin.import_playbook: playbook-update-s3-vars.yml
# - name: Include camoufox verification tasks
# include_tasks: tasks/verify_camoufox.yml
# when: not fast_deploy | default(false)
- name: Run regression test
command: >

View File

@ -0,0 +1,58 @@
---
- name: Update S3 Delivery Airflow Connection
hosts: airflow_master
vars_files:
- "{{ inventory_dir }}/group_vars/all/vault.yml"
- "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
tasks:
- name: Delete existing s3_delivery_connection to ensure an idempotent update
ansible.builtin.command: >
docker compose exec -T airflow-scheduler
airflow connections delete s3_delivery_connection
args:
chdir: "{{ airflow_master_dir }}"
register: delete_s3_conn
retries: 5
delay: 10
until: delete_s3_conn.rc == 0 or 'Did not find a connection' in delete_s3_conn.stderr
changed_when: "'was deleted successfully' in delete_s3_conn.stdout"
failed_when:
- delete_s3_conn.rc != 0
- "'Did not find a connection' not in delete_s3_conn.stderr"
become: yes
become_user: "{{ ansible_user }}"
- name: Add/Update s3_delivery_connection
ansible.builtin.command:
argv:
- docker
- compose
- exec
- -T
- airflow-scheduler
- airflow
- connections
- add
- s3_delivery_connection
- --conn-type
- aws
- --conn-login
- "{{ vault_s3_delivery_access_key_id }}"
- --conn-password
- "{{ vault_s3_delivery_secret_access_key }}"
- --conn-host
- "{{ vault_s3_delivery_endpoint }}"
- --conn-extra
- "{{ s3_extra_dict | to_json }}"
chdir: "{{ airflow_master_dir }}"
vars:
s3_extra_dict:
bucket: "{{ vault_s3_delivery_bucket }}"
region_name: "{{ vault_s3_delivery_aws_region }}"
register: add_s3_conn
retries: 5
delay: 10
until: add_s3_conn.rc == 0
changed_when: "'was successfully added' in add_s3_conn.stdout"
become: yes
become_user: "{{ ansible_user }}"
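# Usage note (a sketch; the inventory path is environment-specific): besides being imported
# from the master deploy play, this playbook can be run standalone, e.g.
#   ansible-playbook -i <inventory> playbook-update-s3-vars.yml
# The resulting connection can then be inspected with
#   docker compose exec -T airflow-scheduler airflow connections list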

View File

@ -147,6 +147,23 @@
mode: '0755'
become: yes
- name: Ensure runtime data directories exist with correct ownership
ansible.builtin.file:
path: "{{ airflow_worker_dir }}/{{ item }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0775'
recurse: yes
loop:
- "downloadfiles"
- "downloadfiles/videos"
- "downloadfiles/videos/in-progress"
- "downloadfiles/videos/ready"
- "inputfiles"
- "dumps"
become: yes
- name: Create .dockerignore on worker to exclude runtime data from build context
ansible.builtin.copy:
dest: "{{ airflow_worker_dir }}/.dockerignore"

View File

@ -57,8 +57,8 @@ YTDLP_TIMEOUT=600
# --- Camoufox (Browser) Configuration ---
CAMOUFOX_PROXIES="{{ (worker_proxies | default([])) | join(',') }}"
VNC_PASSWORD="{{ vault_vnc_password }}"
CAMOUFOX_BASE_VNC_PORT={{ camoufox_base_vnc_port }}
CAMOUFOX_PORT={{ camoufox_base_port }}
CAMOUFOX_BASE_VNC_PORT={{ camoufox_base_vnc_port | default(5901) }}
CAMOUFOX_PORT={{ camoufox_base_port | default(9070) }}
# --- Account Manager Configuration ---
ACCOUNT_ACTIVE_DURATION_MIN={{ account_active_duration_min | default(7) }}

View File

@ -18,8 +18,9 @@
# Retry fragments 10 times
--fragment-retries 10
# Limit download rate to 5M
--limit-rate 5M
# Use a fixed buffer size to stabilize throughput and avoid traffic shaping
--no-resize-buffer
--buffer-size 4M
# Socket timeout
--socket-timeout 15
@ -31,5 +32,10 @@
# Progress
--progress
# Merge to mp4 by default
--merge-output-format mp4
# Don't use "NA" in filenames if metadata is missing
--output-na-placeholder ""
--no-part

View File

@ -0,0 +1,58 @@
# This file contains custom policies for specific testing scenarios.
---
# Policy: Fetch info.json with visitor ID rotation.
# This policy uses a single worker to fetch info.json files for a list of URLs.
# It simulates user churn by creating a new profile (and thus a new visitor_id and POT)
# every 250 requests. A short sleep is used between requests.
name: fetch_with_visitor_id_rotation
settings:
mode: fetch_only
urls_file: "urls.txt" # Placeholder, should be overridden with --set
info_json_script: "bin/ytops-client get-info"
save_info_json_dir: "fetched_info_jsons/visitor_id_rotation"
# Use the modern profile management system to rotate visitor_id.
profile_mode: per_worker_with_rotation
profile_management:
prefix: "visitor_rotator"
# Rotate to a new profile generation after 250 requests.
max_requests_per_profile: 250
execution_control:
run_until: { cycles: 1 } # Run through the URL list once.
workers: 1 # Run with a single worker thread.
# A short, fixed sleep between each info.json request.
sleep_between_tasks: { min_seconds: 0.75, max_seconds: 0.75 }
info_json_generation_policy:
# Use a standard client. The server will handle token generation.
client: web
---
# Policy: Test download specific DASH formats from a folder of info.jsons.
# This policy uses a single worker to test-download a list of video-only DASH
# formats from a directory of existing info.json files. It only downloads the
# first 10KB of each format and sleeps between each file.
name: download_dashy_formats_test
settings:
mode: download_only
# Directory of info.json files to process.
info_json_dir: "fetched_info_jsons/visitor_id_rotation" # Assumes output from the above policy
execution_control:
run_until: { cycles: 1 } # Run through the info.json directory once.
workers: 1 # Run with a single worker thread.
# A longer, randomized sleep between processing each info.json file.
sleep_between_tasks: { min_seconds: 5, max_seconds: 10 }
download_policy:
# A specific list of video-only DASH formats to test.
# The "-dashy" suffix is illustrative; the format IDs must exist in the info.json.
formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy"
# Use the native Python downloader for better performance and control.
downloader: "native-py"
# Pass extra arguments to yt-dlp to perform a "test" download (first 10KB).
extra_args: '--download-sections "*0-10240"'
output_dir: "downloads/dash_test"

View File

@ -1,2 +1 @@
__py_cache__
target/

View File

@ -179,13 +179,63 @@ def main_download_native_py(args):
ydl_opts = {}
if base_opts_args:
try:
# This is an internal API, but it's the most accurate way to parse CLI args
# into the ydl_opts dictionary format.
# yt-dlp's parse_options can return 3 or 4 values. We only need the `opts` namespace (second value).
_parser, opts, _args, *_ = yt_dlp.parse_options(base_opts_args)
ydl_opts = vars(opts)
logger.info(f"Parsing {len(base_opts_args)} arguments from config/extra_args...")
i = 0
while i < len(base_opts_args):
arg = base_opts_args[i]
if not arg.startswith('--'):
logger.warning(f"Skipping non-option argument in extra args: {arg}")
i += 1
continue
key = arg.lstrip('-').replace('-', '_')
# Handle flags (no value)
is_flag = i + 1 >= len(base_opts_args) or base_opts_args[i + 1].startswith('--')
if is_flag:
if key.startswith('no_'):
# Handle --no-foo flags
ydl_opts[key[3:]] = False
else:
ydl_opts[key] = True
logger.debug(f"Parsed flag: {key} = {ydl_opts.get(key[3:] if key.startswith('no_') else key)}")
i += 1
# Handle options with values
else:
value = base_opts_args[i + 1]
# Try to convert values to numbers, which yt-dlp expects.
# This includes parsing byte suffixes like 'K', 'M', 'G'.
if isinstance(value, str):
original_value = value
value_upper = value.upper()
multipliers = {'K': 1024, 'M': 1024**2, 'G': 1024**3, 'T': 1024**4}
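# e.g. '4M' -> int(4 * 1024**2) = 4194304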
if value_upper and value_upper[-1] in multipliers:
try:
num = float(value[:-1])
value = int(num * multipliers[value_upper[-1]])
except (ValueError, TypeError):
value = original_value # fallback
else:
try:
value = int(value)
except (ValueError, TypeError):
try:
value = float(value)
except (ValueError, TypeError):
value = original_value # fallback
# Special handling for keys that differ from CLI arg, e.g. --limit-rate -> ratelimit
if key == 'limit_rate':
key = 'ratelimit'
ydl_opts[key] = value
logger.debug(f"Parsed option: {key} = {value}")
i += 2
logger.info("Successfully parsed extra yt-dlp options.")
except Exception as e:
logger.error(f"Failed to parse options from config/extra_args: {e}")
logger.error(f"Failed to parse options from config/extra_args: {e}", exc_info=True)
return 1
# Now, layer the script's explicit arguments on top, as they have higher precedence.