Added S3 upload changes, Ansible changes tested on the af-test machines, and deprecation of the camoufox run
parent 3709ba6f81
commit 61873b46f9
.gitignore (vendored): 2 changes
@@ -1,2 +1,2 @@
**/__pycache__/*
.aider*
__pycache__
@@ -50,6 +50,12 @@ RUN FFMPEG_URL="https://github.com/yt-dlp/FFmpeg-Builds/releases/download/latest
    ln -sf /opt/ffmpeg/bin/ffprobe /usr/local/bin/ffprobe && \
    rm -rf /tmp/ffmpeg.tar.xz

# Install s5cmd
RUN S5CMD_URL="https://github.com/peak/s5cmd/releases/download/v2.3.0/s5cmd_2.3.0_linux_amd64.deb" && \
    wget -qO /tmp/s5cmd.deb "$S5CMD_URL" && \
    dpkg -i /tmp/s5cmd.deb && \
    rm /tmp/s5cmd.deb

# Install yt-dlp from master
# Temporarily rename pip to bypass the root check in the base image's pip wrapper,
# ensuring a system-wide installation.
@@ -72,7 +78,7 @@ RUN wget -q https://github.com/ginuerzh/gost/releases/download/v2.12.0/gost_2.12
    rm gost_2.12.0_linux_amd64.tar.gz

# Verify installations
RUN ffmpeg -version && deno --version && yt-dlp --version && aria2c --version && gost -V
RUN ffmpeg -version && deno --version && yt-dlp --version && aria2c --version && gost -V && s5cmd version

# Create version information files
RUN ( \
@@ -107,7 +113,8 @@ RUN mv /usr/local/bin/pip /usr/local/bin/pip.orig && \
    "ffprobe3" \
    "python-dotenv" \
    "PyYAML" \
    "aria2p" && \
    "aria2p" \
    "s5cmdpy" && \
    mv /usr/local/bin/pip.orig /usr/local/bin/pip

# --- Install the custom yt_ops_services package ---
@@ -141,6 +148,7 @@ RUN mv /usr/local/bin/pip /usr/local/bin/pip.orig && \
# This fixes permission issues that can occur if previous RUN commands created files in /home/airflow as root.
# We also make it world-writable to accommodate running the container with a different user ID, which can
# happen in some environments (e.g., OpenShift or with docker-compose user overrides).
RUN mkdir -p /home/airflow/.aws && chown -R airflow:airflow /home/airflow/.aws
RUN chown -R airflow:airflow /home/airflow && chmod -R 777 /home/airflow

# Switch to airflow user for all subsequent operations
@@ -54,8 +54,8 @@ x-airflow-common:
    # Remote Logging - connection is configured directly via environment variables
    #_PIP_ADDITIONAL_REQUIREMENTS: ${{ '{' }}_PIP_ADDITIONAL_REQUIREMENTS:- apache-airflow-providers-docker apache-airflow-providers-http thrift>=0.16.0,<=0.20.0 backoff>=2.2.1 python-dotenv==1.0.1 psutil>=5.9.0 apache-airflow-providers-amazon{{ '}' }}
    AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
    AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
    AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
    AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://videos/airflow-logs"
    AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: s3_delivery_connection
    AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
    #AIRFLOW__LOGGING__LOG_ID_TEMPLATE: "{dag_id}-{task_id}-{run_id}-{try_number}"
    AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
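The two changed lines above point remote task logs at the S3 delivery target instead of MinIO. As a rough, hedged sketch only (the real s3_delivery_connection in this setup is defined elsewhere; the key, secret, endpoint and region below are placeholders, not values from this commit), such a connection could be supplied to the containers as an environment-variable connection:

# Hypothetical sketch: providing s3_delivery_connection via AIRFLOW_CONN_<ID>.
# All credential values here are placeholders.
import os

os.environ["AIRFLOW_CONN_S3_DELIVERY_CONNECTION"] = (
    "aws://AKIAEXAMPLE:example-secret-key@/"
    "?region_name=eu-central-1&endpoint_url=https%3A%2F%2Fs3.example.com"
)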
@@ -121,6 +121,51 @@ services:
      - proxynet
    restart: always

  airflow-worker-s3:
    <<: *airflow-common
    container_name: airflow-worker-s3-1
    hostname: ${HOSTNAME:-s3-001}
    # The S3 worker listens on the generic s3 queue AND its own dedicated queue.
    command: airflow celery worker -q queue-s3,queue-s3-${HOSTNAME:-s3-001}
    deploy:
      resources:
        limits:
          memory: ${AIRFLOW_WORKER_S3_MEM_LIMIT:-1G}
        reservations:
          memory: ${AIRFLOW_WORKER_S3_MEM_RESERV:-256M}
    healthcheck:
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-s3@$$(hostname)"'
      interval: 30s
      timeout: 30s
      retries: 5
      start_period: 30s
    environment:
      <<: *airflow-common-env
      S3_DELIVERY_AWS_ACCESS_KEY_ID: "{{ vault_s3_delivery_access_key_id }}"
      S3_DELIVERY_AWS_SECRET_ACCESS_KEY: "{{ vault_s3_delivery_secret_access_key }}"
      S3_DELIVERY_AWS_REGION: "{{ vault_s3_delivery_aws_region }}"
      S3_DELIVERY_ENDPOINT: "{{ vault_s3_delivery_endpoint }}"
      S3_DELIVERY_BUCKET: "{{ vault_s3_delivery_bucket }}"
      HOSTNAME: ${HOSTNAME:-s3-001}
      DUMB_INIT_SETSID: "0"
      AIRFLOW__CELERY__WORKER_QUEUES: "queue-s3,queue-s3-${HOSTNAME:-s3-001}"
      AIRFLOW__CELERY__WORKER_TAGS: "s3"
      AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
      # S3 tasks are lightweight.
      AIRFLOW__CELERY__WORKER_AUTOSCALE: "2,1"
      AIRFLOW__CELERY__POOL: "prefork"
      AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
      AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
      AIRFLOW__CELERY__WORKER_NAME: "worker-s3@%h"
      AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
      AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "262144" # 256MB
    networks:
      - default
      - proxynet
    restart: always
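Because this worker only subscribes to queue-s3 and its host-specific queue, upload work reaches it only if the DAG routes its tasks there. A minimal sketch (not the DAG from this commit) of how a task is pinned to that queue on the Airflow side:

# Minimal routing sketch, assuming the queue names defined in the compose file above.
from airflow.decorators import task
from airflow.models.dag import DAG
from airflow.utils.dates import days_ago

with DAG(dag_id="queue_routing_example", start_date=days_ago(1), schedule=None) as dag:

    @task(queue="queue-s3")  # or "queue-s3-<hostname>" to pin the task to one machine
    def upload_batch():
        print("This runs only on workers subscribed to queue-s3.")

    upload_batch()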

  airflow-worker-auth:
    <<: *airflow-common
    container_name: airflow-worker-auth-1
@@ -175,6 +220,46 @@ services:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    restart: always

  airflow-worker-mgmt:
    <<: *airflow-common
    container_name: airflow-worker-mgmt-1
    hostname: ${HOSTNAME:-mgmt001}
    # The Mgmt worker listens on the generic mgmt queue AND its own dedicated queue.
    command: airflow celery worker -q queue-mgmt,queue-mgmt-${HOSTNAME:-mgmt001}
    deploy:
      resources:
        limits:
          memory: ${AIRFLOW_WORKER_MGMT_MEM_LIMIT:-2G}
        reservations:
          memory: ${AIRFLOW_WORKER_MGMT_MEM_RESERV:-512M}
    healthcheck:
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-mgmt@$$(hostname)"'
      interval: 30s
      timeout: 30s
      retries: 5
      start_period: 30s
    environment:
      <<: *airflow-common-env
      HOSTNAME: ${HOSTNAME:-mgmt001}
      DUMB_INIT_SETSID: "0"
      AIRFLOW__CELERY__WORKER_QUEUES: "queue-mgmt,queue-mgmt-${HOSTNAME:-mgmt001}"
      AIRFLOW__CELERY__WORKER_TAGS: "mgmt"
      AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
      # Mgmt tasks are lightweight.
      AIRFLOW__CELERY__WORKER_AUTOSCALE: "4,2"
      AIRFLOW__CELERY__POOL: "prefork"
      AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
      AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
      AIRFLOW__CELERY__WORKER_NAME: "worker-mgmt@%h"
      AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
      AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "262144" # 256MB
    networks:
      - default
      - proxynet
    restart: always

networks:
  proxynet:
    name: airflow_proxynet
@@ -1,12 +1,4 @@
name: ytdlp-ops
{% if service_role is defined and service_role != 'management' %}
include:
  # This automatically includes the generated camoufox service definitions and dependencies.
  # It simplifies the docker-compose command, as you no longer need to specify both files with -f.
  # The file is generated by the config-generator service and will be created even if empty.
  - ./configs/docker-compose.camoufox.yaml
{% endif %}

services:
  bgutil-provider:
    image: brainicism/bgutil-ytdlp-pot-provider
@@ -66,10 +58,6 @@ services:
    depends_on:
      context-prepper:
        condition: service_completed_successfully
{% if service_role is defined and service_role != 'management' %}
      camoufox-group:
        condition: service_started
{% endif %}
    # Ports are no longer exposed directly. Envoy will connect to them on the internal network.
    # entrypoint:
    #   - /bin/sh
@@ -54,22 +54,6 @@ services:
      - proxynet
{% endfor %}

{% if camoufox_proxies %}
  # This service is a dependency anchor. The main services depend on it,
  # and it in turn depends on all camoufox instances.
  camoufox-group:
    image: alpine:latest
    command: ["echo", "Camoufox group ready."]
    restart: "no"
    depends_on:
{% for proxy in camoufox_proxies %}
{% set proxy_port = _get_port_from_proxy_url(proxy.url) | int %}
      - camoufox-{{ proxy_port }}-{{ loop.index }}
{% endfor %}
    networks:
      - proxynet
{% endif %}

volumes:
{% for proxy in camoufox_proxies %}
{% set proxy_port = _get_port_from_proxy_url(proxy.url) | int %}
@@ -788,6 +788,7 @@ def manage_system_callable(**context):

with DAG(
    dag_id="ytdlp_mgmt_proxy_account",
    default_args={"queue": "queue-mgmt"},
    start_date=days_ago(1),
    schedule=None,
    catchup=False,

@@ -591,6 +591,7 @@ with DAG(
        "owner": "airflow",
        "start_date": days_ago(1),
        "retries": 0,
        "queue": "queue-mgmt",
    },
    schedule=None,
    catchup=False,
@@ -27,7 +27,7 @@ from airflow.utils.dates import days_ago

# Import utility functions and Thrift modules
from utils.redis_utils import _get_redis_client
from pangramia.yt.tokens_ops import YTTokenOpService
from pangramia.yt.management import YTManagementService
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport

@@ -36,14 +36,14 @@ logger = logging.getLogger(__name__)

# Default settings from Airflow Variables or hardcoded fallbacks
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9980)
DEFAULT_MANAGEMENT_SERVICE_IP = Variable.get("MANAGEMENT_SERVICE_HOST", default_var="172.17.0.1")
DEFAULT_MANAGEMENT_SERVICE_PORT = Variable.get("MANAGEMENT_SERVICE_PORT", default_var=9080)

DEFAULT_ARGS = {
    'owner': 'airflow',
    'retries': 1,
    'retry_delay': 30,
    'queue': 'default',
    'queue': 'queue-mgmt',
}


@@ -55,7 +55,7 @@ def _get_thrift_client(host, port, timeout=60):
    transport.setTimeout(timeout * 1000)
    transport = TTransport.TFramedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = YTTokenOpService.Client(protocol)
    client = YTManagementService.Client(protocol)
    transport.open()
    logger.info(f"Connected to Thrift server at {host}:{port}")
    return client, transport
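The only functional change in this hunk is that the helper now returns a YTManagementService client instead of a YTTokenOpService client. As a hedged usage sketch (the RPC name below is a placeholder, since the diff does not show which management calls the DAG makes), callers are expected to close the framed transport when done:

# Illustrative caller of _get_thrift_client; 'some_management_call' is a placeholder.
client, transport = _get_thrift_client(DEFAULT_MANAGEMENT_SERVICE_IP,
                                        int(DEFAULT_MANAGEMENT_SERVICE_PORT),
                                        timeout=60)
try:
    result = client.some_management_call()  # placeholder RPC on YTManagementService
finally:
    transport.close()  # always release the TFramedTransport/TSocket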
@@ -72,8 +72,8 @@ def manage_account_states(**context):
    cooldown_duration_s = params['account_cooldown_duration_min'] * 60
    ban_duration_s = params['account_ban_duration_hours'] * 3600

    host = DEFAULT_YT_AUTH_SERVICE_IP
    port = int(DEFAULT_YT_AUTH_SERVICE_PORT)
    host = DEFAULT_MANAGEMENT_SERVICE_IP
    port = int(DEFAULT_MANAGEMENT_SERVICE_PORT)
    redis_conn_id = DEFAULT_REDIS_CONN_ID
    logger.info(f"Starting account maintenance. Service: {host}:{port}, Redis: {redis_conn_id}")
    logger.info(f"Using limits: Requests={requests_limit}, Cooldown={params['account_cooldown_duration_min']}m, Ban={params['account_ban_duration_hours']}h")
@@ -230,8 +230,8 @@ with DAG(
    This process gives full control over time-based account lifecycle management to the Airflow orchestrator.
    """,
    params={
        'account_requests_limit': Param(250, type="integer", description="Number of successful requests an account can make before it is rested."),
        'account_cooldown_duration_min': Param(60, type="integer", description="Duration in minutes an account must rest before being activated again. Default is 1 hour."),
        'account_requests_limit': Param(250, type="integer", description="Number of successful requests an account can make before it is rested. Default is 250."),
        'account_cooldown_duration_min': Param(60, type="integer", description="Duration in minutes an account must rest ('pause') before being activated again. Default is 60 minutes (1 hour)."),
        'account_ban_duration_hours': Param(24, type="integer", description="Duration in hours an account stays banned before it can be un-banned."),
    }
) as dag:

@@ -411,6 +411,7 @@ with DAG(
    orchestrate_task = PythonOperator(
        task_id='start_worker_loops',
        python_callable=orchestrate_workers_ignition_callable,
        queue='queue-mgmt',
    )
    orchestrate_task.doc_md = """
    ### Start Worker Loops
@ -37,6 +37,7 @@ import socket
|
||||
import time
|
||||
import traceback
|
||||
import uuid
|
||||
import shutil
|
||||
|
||||
# Import utility functions and Thrift modules
|
||||
from utils.redis_utils import _get_redis_client
|
||||
@ -140,9 +141,14 @@ def _get_thrift_client(host, port, timeout):
|
||||
return client, transport
|
||||
|
||||
def _extract_video_id(url):
|
||||
"""Extracts YouTube video ID from URL."""
|
||||
"""Extracts YouTube video ID from a URL or returns the input if it's already a valid ID."""
|
||||
if not url or not isinstance(url, str):
|
||||
return None
|
||||
|
||||
# Check if the input is already a valid 11-character video ID
|
||||
if re.fullmatch(r'[a-zA-Z0-9_-]{11}', url):
|
||||
return url
|
||||
|
||||
patterns = [r'v=([a-zA-Z0-9_-]{11})', r'youtu\.be/([a-zA-Z0-9_-]{11})']
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, url)
|
||||
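With this change the helper accepts either a full YouTube URL or a bare 11-character video ID. A short illustration of the intended behaviour (the sample ID is just an example):

# Expected behaviour of the updated _extract_video_id:
assert _extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") == "dQw4w9WgXcQ"
assert _extract_video_id("https://youtu.be/dQw4w9WgXcQ") == "dQw4w9WgXcQ"
assert _extract_video_id("dQw4w9WgXcQ") == "dQw4w9WgXcQ"  # bare ID is returned as-is
assert _extract_video_id(None) is None                     # non-string input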
@ -299,7 +305,7 @@ def get_token(initial_data: dict, **context):
|
||||
|
||||
account_id = initial_data['account_id']
|
||||
url = initial_data['url_to_process']
|
||||
info_json_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')
|
||||
info_json_dir = os.path.join(Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles'), 'videos', 'in-progress')
|
||||
|
||||
host, port = params['service_ip'], int(params['service_port'])
|
||||
machine_id = params.get('machine_id') or socket.gethostname()
|
||||
@ -308,9 +314,11 @@ def get_token(initial_data: dict, **context):
|
||||
assigned_proxy_url = params.get('assigned_proxy_url')
|
||||
|
||||
video_id = _extract_video_id(url)
|
||||
os.makedirs(info_json_dir, exist_ok=True)
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
info_json_path = os.path.join(info_json_dir, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json")
|
||||
job_dir_name = f"{timestamp}-{video_id or 'unknown'}"
|
||||
job_dir_path = os.path.join(info_json_dir, job_dir_name)
|
||||
os.makedirs(job_dir_path, exist_ok=True)
|
||||
info_json_path = os.path.join(job_dir_path, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json")
|
||||
|
||||
cmd = [
|
||||
'ytops-client', 'get-info',
|
||||
@ -375,6 +383,7 @@ def get_token(initial_data: dict, **context):
|
||||
|
||||
return {
|
||||
'info_json_path': info_json_path,
|
||||
'job_dir_path': job_dir_path,
|
||||
'socks_proxy': proxy,
|
||||
'ytdlp_command': None,
|
||||
'successful_account_id': account_id,
|
||||
@ -653,7 +662,10 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
|
||||
info_json_path = token_data.get('info_json_path')
|
||||
proxy = token_data.get('socks_proxy')
|
||||
original_url = token_data.get('original_url')
|
||||
download_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles/video')
|
||||
download_dir = token_data.get('job_dir_path')
|
||||
if not download_dir:
|
||||
# Fallback for older runs or if job_dir_path is missing
|
||||
download_dir = os.path.dirname(info_json_path)
|
||||
|
||||
format_preset = params.get('download_format_preset', 'best_audio')
|
||||
if format_preset == 'custom':
|
||||
@ -678,6 +690,21 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
|
||||
if not (info_json_path and os.path.exists(info_json_path)):
|
||||
raise AirflowException(f"Error: info.json path is missing or file does not exist ({info_json_path}).")
|
||||
|
||||
# WORKAROUND: The auth service may inject a 'js_runtimes' key into the info.json
|
||||
# that is incompatible with the yt-dlp library's expectations, causing a crash.
|
||||
# We remove it here before passing it to the download tool.
|
||||
try:
|
||||
with open(info_json_path, 'r+', encoding='utf-8') as f:
|
||||
info_data = json.load(f)
|
||||
if 'js_runtimes' in info_data:
|
||||
logger.info("Found 'js_runtimes' key in info.json. Removing it as a workaround for yt-dlp library incompatibility.")
|
||||
del info_data['js_runtimes']
|
||||
f.seek(0)
|
||||
json.dump(info_data, f)
|
||||
f.truncate()
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not process/remove 'js_runtimes' from info.json: {e}", exc_info=True)
|
||||
|
||||
def run_yt_dlp_command(format_selector: str):
|
||||
"""Constructs and runs a yt-ops-client download command, returning a list of final filenames."""
|
||||
downloader = params.get('downloader', 'py')
|
||||
@ -690,20 +717,26 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
|
||||
cmd.extend(['--output-dir', download_dir])
|
||||
# The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args
|
||||
|
||||
# WORKAROUND: Due to an incompatibility between ytops-client and a recent yt-dlp
|
||||
# library update, passing --extra-ytdlp-args to the 'py' downloader causes a crash.
|
||||
# These arguments are being omitted until ytops-client is fixed.
|
||||
# This affects: fragment_retries, limit_rate, socket_timeout, sleep_interval,
|
||||
# max_sleep_interval, yt_dlp_test_mode, and the 'yt_dlp_extra_args' DAG param.
|
||||
has_extra_args = (
|
||||
params.get('fragment_retries') or params.get('limit_rate') or
|
||||
params.get('socket_timeout') or params.get('min_sleep_interval') or
|
||||
params.get('max_sleep_interval') or params.get('yt_dlp_test_mode') or
|
||||
params.get('yt_dlp_extra_args')
|
||||
)
|
||||
if has_extra_args:
|
||||
logger.warning("WORKAROUND: Omitting --extra-ytdlp-args for 'py' downloader due to a known incompatibility. "
|
||||
"Some download parameters will be ignored.")
|
||||
# The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args
|
||||
py_extra_args = []
|
||||
if params.get('fragment_retries'):
|
||||
py_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])])
|
||||
if params.get('limit_rate'):
|
||||
py_extra_args.extend(['--limit-rate', params['limit_rate']])
|
||||
if params.get('socket_timeout'):
|
||||
py_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])])
|
||||
if params.get('min_sleep_interval'):
|
||||
py_extra_args.extend(['--sleep-interval', str(params['min_sleep_interval'])])
|
||||
if params.get('max_sleep_interval'):
|
||||
py_extra_args.extend(['--max-sleep-interval', str(params['max_sleep_interval'])])
|
||||
if params.get('yt_dlp_test_mode'):
|
||||
py_extra_args.append('--test')
|
||||
|
||||
existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '')
|
||||
final_extra_args_list = existing_extra + py_extra_args
|
||||
if final_extra_args_list:
|
||||
final_extra_args_str = shlex.join(final_extra_args_list)
|
||||
cmd.extend(['--extra-ytdlp-args', final_extra_args_str])
|
||||
|
||||
elif downloader == 'aria-rpc':
|
||||
cmd.extend([
|
||||
@ -744,7 +777,10 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
|
||||
cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)])
|
||||
|
||||
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
|
||||
logger.info(f"Executing download command for format '{format_selector}': {copy_paste_cmd}")
|
||||
logger.info(f"--- Preparing to execute ytops-client ---")
|
||||
logger.info(f"Full ytops-client command for format '{format_selector}':")
|
||||
logger.info(copy_paste_cmd)
|
||||
logger.info(f"-----------------------------------------")
|
||||
process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
|
||||
|
||||
if process.stdout:
|
||||
@ -855,9 +891,20 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
|
||||
final_formats_to_download = formats_to_download_initial
|
||||
else:
|
||||
for selector in formats_to_download_initial:
|
||||
# A selector can be '140' or '299/298/137'
|
||||
# A selector can be '140' or '299/298/137' or '140-dashy'
|
||||
individual_ids = re.split(r'[/+]', selector)
|
||||
if any(fid in available_formats for fid in individual_ids):
|
||||
|
||||
# Extract the numeric part of the format ID for checking against available_formats
|
||||
is_available = False
|
||||
for fid in individual_ids:
|
||||
numeric_id_match = re.match(r'^\d+', fid)
|
||||
if numeric_id_match:
|
||||
numeric_id = numeric_id_match.group(0)
|
||||
if numeric_id in available_formats:
|
||||
is_available = True
|
||||
break # Found a match, no need to check other parts of the selector
|
||||
|
||||
if is_available:
|
||||
final_formats_to_download.append(selector)
|
||||
else:
|
||||
logger.warning(f"Requested format selector '{selector}' contains no available formats. Skipping.")
|
||||
@ -872,54 +919,51 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
|
||||
logger.info(f"Test mode: yt-dlp returned {len(successful_files)} filenames. Skipping probe failure checks.")
|
||||
if not successful_files:
|
||||
raise AirflowException("Test run did not produce any filenames.")
|
||||
return successful_files
|
||||
# Do not return here. Proceed to the cleanup and move logic.
|
||||
|
||||
if not failed_files:
|
||||
if not successful_files:
|
||||
raise AirflowException("Download and probe process completed but produced no valid files.")
|
||||
return successful_files
|
||||
final_success_list = successful_files
|
||||
if failed_files:
|
||||
# --- Handle Probe Failures and Retry ---
|
||||
if not retry_on_probe_failure:
|
||||
raise AirflowException(f"Probe failed for {len(failed_files)} file(s) and retry is disabled: {failed_files}")
|
||||
|
||||
# --- Handle Probe Failures and Retry ---
|
||||
if not retry_on_probe_failure:
|
||||
raise AirflowException(f"Probe failed for {len(failed_files)} file(s) and retry is disabled: {failed_files}")
|
||||
logger.warning(f"Probe failed for {len(failed_files)} file(s). Attempting one re-download for failed files...")
|
||||
|
||||
logger.warning(f"Probe failed for {len(failed_files)} file(s). Attempting one re-download for failed files...")
|
||||
format_ids_to_retry = []
|
||||
# Since each download is now for a specific selector and the output template
|
||||
# includes the format_id, we can always attempt to extract the format_id
|
||||
# from the failed filename for a targeted retry.
|
||||
for f in failed_files:
|
||||
match = re.search(r'\.f([\d]+)\.', f)
|
||||
if match:
|
||||
format_ids_to_retry.append(match.group(1))
|
||||
else:
|
||||
logger.error(f"Could not extract format_id from failed file '{f}'. Cannot retry this specific file.")
|
||||
formats_to_download_retry = format_ids_to_retry
|
||||
|
||||
format_ids_to_retry = []
|
||||
# Since each download is now for a specific selector and the output template
|
||||
# includes the format_id, we can always attempt to extract the format_id
|
||||
# from the failed filename for a targeted retry.
|
||||
for f in failed_files:
|
||||
match = re.search(r'\.f([\d]+)\.', f)
|
||||
if match:
|
||||
format_ids_to_retry.append(match.group(1))
|
||||
else:
|
||||
logger.error(f"Could not extract format_id from failed file '{f}'. Cannot retry this specific file.")
|
||||
formats_to_download_retry = format_ids_to_retry
|
||||
if not formats_to_download_retry:
|
||||
raise AirflowException("Probe failed, but could not determine which formats to retry.")
|
||||
|
||||
if not formats_to_download_retry:
|
||||
raise AirflowException("Probe failed, but could not determine which formats to retry.")
|
||||
# Rename failed files to allow for a fresh download attempt
|
||||
for f in failed_files:
|
||||
try:
|
||||
failed_path = f"{f}.probe_failed_{int(time.time())}"
|
||||
os.rename(f, failed_path)
|
||||
logger.info(f"Renamed corrupted file to {failed_path}")
|
||||
except OSError as rename_err:
|
||||
logger.error(f"Could not rename corrupted file '{f}': {rename_err}")
|
||||
|
||||
# Rename failed files to allow for a fresh download attempt
|
||||
for f in failed_files:
|
||||
try:
|
||||
failed_path = f"{f}.probe_failed_{int(time.time())}"
|
||||
os.rename(f, failed_path)
|
||||
logger.info(f"Renamed corrupted file to {failed_path}")
|
||||
except OSError as rename_err:
|
||||
logger.error(f"Could not rename corrupted file '{f}': {rename_err}")
|
||||
# --- Retry Download and Probe ---
|
||||
retried_successful_files, retried_failed_files = _download_and_probe_formats(formats_to_download_retry)
|
||||
|
||||
# --- Retry Download and Probe ---
|
||||
retried_successful_files, retried_failed_files = _download_and_probe_formats(formats_to_download_retry)
|
||||
if retried_failed_files:
|
||||
logger.error(f"Probe failed again for {len(retried_failed_files)} file(s) after retry: {retried_failed_files}")
|
||||
|
||||
if retried_failed_files:
|
||||
logger.error(f"Probe failed again for {len(retried_failed_files)} file(s) after retry: {retried_failed_files}")
|
||||
final_success_list = successful_files + retried_successful_files
|
||||
logger.info(f"Retry complete. Final success count: {len(final_success_list)} file(s).")
|
||||
|
||||
final_success_list = successful_files + retried_successful_files
|
||||
if not final_success_list:
|
||||
raise AirflowException("All files failed to download or probe correctly, even after retry.")
|
||||
|
||||
logger.info(f"Retry complete. Final success count: {len(final_success_list)} file(s).")
|
||||
raise AirflowException("Download and probe process completed but produced no valid files.")
|
||||
|
||||
if params.get('yt_dlp_cleanup_mode', True):
|
||||
logger.info(f"Cleanup mode is enabled. Creating .empty files and deleting originals for {len(final_success_list)} files.")
|
||||
@ -935,6 +979,35 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
|
||||
logger.error(f"Error during cleanup for file {f}: {e}", exc_info=True)
|
||||
# Do not fail the task for a cleanup error, just log it.
|
||||
|
||||
# --- Move completed job directory to final destination ---
|
||||
try:
|
||||
video_id = _extract_video_id(original_url)
|
||||
if not video_id:
|
||||
logger.error(f"Could not extract video_id from URL '{original_url}' for final move. Skipping.")
|
||||
else:
|
||||
source_dir = download_dir # This is the job_dir_path
|
||||
|
||||
# Group downloads into 10-minute batch folders based on completion time.
|
||||
now = datetime.now()
|
||||
rounded_minute = (now.minute // 10) * 10
|
||||
timestamp_str = now.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}"
|
||||
|
||||
final_dir_base = os.path.join(Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles'), 'videos', 'ready', timestamp_str)
|
||||
final_dir_path = os.path.join(final_dir_base, video_id)
|
||||
|
||||
os.makedirs(final_dir_base, exist_ok=True)
|
||||
|
||||
logger.info(f"Moving completed job from '{source_dir}' to final destination '{final_dir_path}'")
|
||||
if os.path.exists(final_dir_path):
|
||||
logger.warning(f"Destination '{final_dir_path}' already exists. It will be removed and replaced.")
|
||||
shutil.rmtree(final_dir_path)
|
||||
|
||||
shutil.move(source_dir, final_dir_path)
|
||||
logger.info(f"Successfully moved job to '{final_dir_path}'.")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to move completed job directory: {e}", exc_info=True)
|
||||
# Do not fail the task for a move error, just log it.
|
||||
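The move step above is what feeds the new ytdlp_s3_uploader DAG: finished jobs are grouped under videos/ready/ in 10-minute batch folders. A small worked example of the folder-name arithmetic (the sample time is invented):

# Same rounding as above, shown with a concrete (invented) completion time.
from datetime import datetime

now = datetime(2025, 1, 1, 14, 37)        # a download finishing at 14:37
rounded_minute = (now.minute // 10) * 10  # 37 -> 30
batch = now.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}"
assert batch == "20250101T1430"           # job moves to .../videos/ready/20250101T1430/<video_id>/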
|
||||
return final_success_list
|
||||
except Exception as e:
|
||||
if 'HTTP Error 403: Forbidden' in str(e):
|
||||
@ -1464,7 +1537,7 @@ with DAG(
|
||||
'assigned_proxy_url': Param(None, type=["string", "null"], title="[Worker Param] Assigned Proxy URL", description="A specific proxy URL to use for the request, overriding the server's proxy pool logic."),
|
||||
'clients': Param('tv_simply', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"),
|
||||
'timeout': Param(DEFAULT_TIMEOUT, type="integer"),
|
||||
'output_path_template': Param("%(title)s [%(id)s].f%(format_id)s.%(ext)s", type="string", title="[Worker Param] Output Path Template", description="Output filename template for yt-dlp. It is highly recommended to include `%(format_id)s` to prevent filename collisions when downloading multiple formats."),
|
||||
'output_path_template': Param("%(id)s.f%(format_id)s.%(ext)s", type="string", title="[Worker Param] Output Path Template", description="Output filename template for yt-dlp. It is highly recommended to include `%(format_id)s` to prevent filename collisions when downloading multiple formats."),
|
||||
'on_auth_failure': Param(
|
||||
'retry_with_new_account',
|
||||
type="string",
|
||||
|
||||
@@ -322,6 +322,7 @@ with DAG(
    orchestrate_task = PythonOperator(
        task_id='start_worker_loops',
        python_callable=orchestrate_workers_ignition_callable,
        queue='queue-mgmt',
    )
    orchestrate_task.doc_md = """
    ### Start Worker Loops

@@ -290,6 +290,7 @@ with DAG(
    orchestrate_task = PythonOperator(
        task_id='start_worker_loops',
        python_callable=orchestrate_workers_ignition_callable,
        queue='queue-mgmt',
    )
    orchestrate_task.doc_md = """
    ### Start Worker Loops
@ -175,9 +175,14 @@ def _get_thrift_client(host, port, timeout):
|
||||
return client, transport
|
||||
|
||||
def _extract_video_id(url):
|
||||
"""Extracts YouTube video ID from URL."""
|
||||
"""Extracts YouTube video ID from a URL or returns the input if it's already a valid ID."""
|
||||
if not url or not isinstance(url, str):
|
||||
return None
|
||||
|
||||
# Check if the input is already a valid 11-character video ID
|
||||
if re.fullmatch(r'[a-zA-Z0-9_-]{11}', url):
|
||||
return url
|
||||
|
||||
patterns = [r'v=([a-zA-Z0-9_-]{11})', r'youtu\.be/([a-zA-Z0-9_-]{11})']
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, url)
|
||||
@ -389,7 +394,7 @@ def get_token(initial_data: dict, **context):
|
||||
|
||||
account_id = initial_data['account_id']
|
||||
url = initial_data['url_to_process']
|
||||
info_json_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')
|
||||
info_json_dir = os.path.join(Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles'), 'videos', 'in-progress')
|
||||
|
||||
host, port = params['service_ip'], int(params['service_port'])
|
||||
machine_id = params.get('machine_id') or socket.gethostname()
|
||||
@ -468,6 +473,7 @@ def get_token(initial_data: dict, **context):
|
||||
|
||||
return {
|
||||
'info_json_path': info_json_path,
|
||||
'job_dir_path': job_dir_path,
|
||||
'socks_proxy': proxy,
|
||||
'ytdlp_command': None,
|
||||
'successful_account_id': account_id,
|
||||
|
||||
@ -36,6 +36,7 @@ import socket
|
||||
import time
|
||||
import traceback
|
||||
import uuid
|
||||
import shutil
|
||||
|
||||
# Import utility functions and Thrift modules
|
||||
from utils.redis_utils import _get_redis_client
|
||||
@ -128,9 +129,14 @@ DEFAULT_ARGS = {
|
||||
# --- Helper Functions ---
|
||||
|
||||
def _extract_video_id(url):
|
||||
"""Extracts YouTube video ID from URL."""
|
||||
"""Extracts YouTube video ID from a URL or returns the input if it's already a valid ID."""
|
||||
if not url or not isinstance(url, str):
|
||||
return None
|
||||
|
||||
# Check if the input is already a valid 11-character video ID
|
||||
if re.fullmatch(r'[a-zA-Z0-9_-]{11}', url):
|
||||
return url
|
||||
|
||||
patterns = [r'v=([a-zA-Z0-9_-]{11})', r'youtu\.be/([a-zA-Z0-9_-]{11})']
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, url)
|
||||
@ -288,7 +294,25 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
|
||||
if not (info_json_path and os.path.exists(info_json_path)):
|
||||
raise AirflowException(f"Error: info.json path is missing or file does not exist ({info_json_path}).")
|
||||
|
||||
download_dir = os.path.dirname(info_json_path)
|
||||
# WORKAROUND: The auth service may inject a 'js_runtimes' key into the info.json
|
||||
# that is incompatible with the yt-dlp library's expectations, causing a crash.
|
||||
# We remove it here before passing it to the download tool.
|
||||
try:
|
||||
with open(info_json_path, 'r+', encoding='utf-8') as f:
|
||||
info_data = json.load(f)
|
||||
if 'js_runtimes' in info_data:
|
||||
logger.info("Found 'js_runtimes' key in info.json. Removing it as a workaround for yt-dlp library incompatibility.")
|
||||
del info_data['js_runtimes']
|
||||
f.seek(0)
|
||||
json.dump(info_data, f)
|
||||
f.truncate()
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not process/remove 'js_runtimes' from info.json: {e}", exc_info=True)
|
||||
|
||||
download_dir = token_data.get('job_dir_path')
|
||||
if not download_dir:
|
||||
# Fallback for older runs or if job_dir_path is missing
|
||||
download_dir = os.path.dirname(info_json_path)
|
||||
|
||||
format_preset = params.get('download_format_preset', 'best_audio')
|
||||
if format_preset == 'custom':
|
||||
@ -322,20 +346,26 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
|
||||
cmd.extend(['--output-dir', download_dir])
|
||||
# The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args
|
||||
|
||||
# WORKAROUND: Due to an incompatibility between ytops-client and a recent yt-dlp
|
||||
# library update, passing --extra-ytdlp-args to the 'py' downloader causes a crash.
|
||||
# These arguments are being omitted until ytops-client is fixed.
|
||||
# This affects: fragment_retries, limit_rate, socket_timeout, sleep_interval,
|
||||
# max_sleep_interval, yt_dlp_test_mode, and the 'yt_dlp_extra_args' DAG param.
|
||||
has_extra_args = (
|
||||
params.get('fragment_retries') or params.get('limit_rate') or
|
||||
params.get('socket_timeout') or params.get('min_sleep_interval') or
|
||||
params.get('max_sleep_interval') or params.get('yt_dlp_test_mode') or
|
||||
params.get('yt_dlp_extra_args')
|
||||
)
|
||||
if has_extra_args:
|
||||
logger.warning("WORKAROUND: Omitting --extra-ytdlp-args for 'py' downloader due to a known incompatibility. "
|
||||
"Some download parameters will be ignored.")
|
||||
# The 'py' tool maps many yt-dlp flags via --extra-ytdlp-args
|
||||
py_extra_args = []
|
||||
if params.get('fragment_retries'):
|
||||
py_extra_args.extend(['--fragment-retries', str(params['fragment_retries'])])
|
||||
if params.get('limit_rate'):
|
||||
py_extra_args.extend(['--limit-rate', params['limit_rate']])
|
||||
if params.get('socket_timeout'):
|
||||
py_extra_args.extend(['--socket-timeout', str(params['socket_timeout'])])
|
||||
if params.get('min_sleep_interval'):
|
||||
py_extra_args.extend(['--sleep-interval', str(params['min_sleep_interval'])])
|
||||
if params.get('max_sleep_interval'):
|
||||
py_extra_args.extend(['--max-sleep-interval', str(params['max_sleep_interval'])])
|
||||
if params.get('yt_dlp_test_mode'):
|
||||
py_extra_args.append('--test')
|
||||
|
||||
existing_extra = shlex.split(params.get('yt_dlp_extra_args') or '')
|
||||
final_extra_args_list = existing_extra + py_extra_args
|
||||
if final_extra_args_list:
|
||||
final_extra_args_str = shlex.join(final_extra_args_list)
|
||||
cmd.extend(['--extra-ytdlp-args', final_extra_args_str])
|
||||
|
||||
elif downloader == 'aria-rpc':
|
||||
cmd.extend([
|
||||
@ -376,7 +406,10 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
|
||||
cmd.extend(['--extra-ytdlp-args', shlex.join(final_extra_args)])
|
||||
|
||||
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
|
||||
logger.info(f"Executing download command for format '{format_selector}': {copy_paste_cmd}")
|
||||
logger.info(f"--- Preparing to execute ytops-client ---")
|
||||
logger.info(f"Full ytops-client command for format '{format_selector}':")
|
||||
logger.info(copy_paste_cmd)
|
||||
logger.info(f"-----------------------------------------")
|
||||
process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
|
||||
|
||||
if process.stdout:
|
||||
@ -487,9 +520,20 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
|
||||
final_formats_to_download = formats_to_download_initial
|
||||
else:
|
||||
for selector in formats_to_download_initial:
|
||||
# A selector can be '140' or '299/298/137'
|
||||
# A selector can be '140' or '299/298/137' or '140-dashy'
|
||||
individual_ids = re.split(r'[/+]', selector)
|
||||
if any(fid in available_formats for fid in individual_ids):
|
||||
|
||||
# Extract the numeric part of the format ID for checking against available_formats
|
||||
is_available = False
|
||||
for fid in individual_ids:
|
||||
numeric_id_match = re.match(r'^\d+', fid)
|
||||
if numeric_id_match:
|
||||
numeric_id = numeric_id_match.group(0)
|
||||
if numeric_id in available_formats:
|
||||
is_available = True
|
||||
break # Found a match, no need to check other parts of the selector
|
||||
|
||||
if is_available:
|
||||
final_formats_to_download.append(selector)
|
||||
else:
|
||||
logger.warning(f"Requested format selector '{selector}' contains no available formats. Skipping.")
|
||||
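The new check strips non-numeric suffixes such as '-dashy' before comparing against the probed format list. A standalone sketch of the same matching rule (selectors and format lists below are examples only):

# Standalone illustration of the numeric-prefix matching introduced above.
import re

def selector_is_available(selector: str, available_formats: list[str]) -> bool:
    """True if any numeric format id inside the selector appears in available_formats."""
    for fid in re.split(r'[/+]', selector):
        numeric_id_match = re.match(r'^\d+', fid)
        if numeric_id_match and numeric_id_match.group(0) in available_formats:
            return True
    return False

assert selector_is_available("140-dashy", ["140", "299"]) is True
assert selector_is_available("299/298/137", ["137"]) is True
assert selector_is_available("251", ["140"]) is False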
@ -504,54 +548,51 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
|
||||
logger.info(f"Test mode: yt-dlp returned {len(successful_files)} filenames. Skipping probe failure checks.")
|
||||
if not successful_files:
|
||||
raise AirflowException("Test run did not produce any filenames.")
|
||||
return successful_files
|
||||
# Do not return here. Proceed to the cleanup and move logic.
|
||||
|
||||
if not failed_files:
|
||||
if not successful_files:
|
||||
raise AirflowException("Download and probe process completed but produced no valid files.")
|
||||
return successful_files
|
||||
final_success_list = successful_files
|
||||
if failed_files:
|
||||
# --- Handle Probe Failures and Retry ---
|
||||
if not retry_on_probe_failure:
|
||||
raise AirflowException(f"Probe failed for {len(failed_files)} file(s) and retry is disabled: {failed_files}")
|
||||
|
||||
# --- Handle Probe Failures and Retry ---
|
||||
if not retry_on_probe_failure:
|
||||
raise AirflowException(f"Probe failed for {len(failed_files)} file(s) and retry is disabled: {failed_files}")
|
||||
logger.warning(f"Probe failed for {len(failed_files)} file(s). Attempting one re-download for failed files...")
|
||||
|
||||
logger.warning(f"Probe failed for {len(failed_files)} file(s). Attempting one re-download for failed files...")
|
||||
format_ids_to_retry = []
|
||||
# Since each download is now for a specific selector and the output template
|
||||
# includes the format_id, we can always attempt to extract the format_id
|
||||
# from the failed filename for a targeted retry.
|
||||
for f in failed_files:
|
||||
match = re.search(r'\.f([\d]+)\.', f)
|
||||
if match:
|
||||
format_ids_to_retry.append(match.group(1))
|
||||
else:
|
||||
logger.error(f"Could not extract format_id from failed file '{f}'. Cannot retry this specific file.")
|
||||
formats_to_download_retry = format_ids_to_retry
|
||||
|
||||
format_ids_to_retry = []
|
||||
# Since each download is now for a specific selector and the output template
|
||||
# includes the format_id, we can always attempt to extract the format_id
|
||||
# from the failed filename for a targeted retry.
|
||||
for f in failed_files:
|
||||
match = re.search(r'\.f([\d]+)\.', f)
|
||||
if match:
|
||||
format_ids_to_retry.append(match.group(1))
|
||||
else:
|
||||
logger.error(f"Could not extract format_id from failed file '{f}'. Cannot retry this specific file.")
|
||||
formats_to_download_retry = format_ids_to_retry
|
||||
if not formats_to_download_retry:
|
||||
raise AirflowException("Probe failed, but could not determine which formats to retry.")
|
||||
|
||||
if not formats_to_download_retry:
|
||||
raise AirflowException("Probe failed, but could not determine which formats to retry.")
|
||||
# Rename failed files to allow for a fresh download attempt
|
||||
for f in failed_files:
|
||||
try:
|
||||
failed_path = f"{f}.probe_failed_{int(time.time())}"
|
||||
os.rename(f, failed_path)
|
||||
logger.info(f"Renamed corrupted file to {failed_path}")
|
||||
except OSError as rename_err:
|
||||
logger.error(f"Could not rename corrupted file '{f}': {rename_err}")
|
||||
|
||||
# Rename failed files to allow for a fresh download attempt
|
||||
for f in failed_files:
|
||||
try:
|
||||
failed_path = f"{f}.probe_failed_{int(time.time())}"
|
||||
os.rename(f, failed_path)
|
||||
logger.info(f"Renamed corrupted file to {failed_path}")
|
||||
except OSError as rename_err:
|
||||
logger.error(f"Could not rename corrupted file '{f}': {rename_err}")
|
||||
# --- Retry Download and Probe ---
|
||||
retried_successful_files, retried_failed_files = _download_and_probe_formats(formats_to_download_retry)
|
||||
|
||||
# --- Retry Download and Probe ---
|
||||
retried_successful_files, retried_failed_files = _download_and_probe_formats(formats_to_download_retry)
|
||||
if retried_failed_files:
|
||||
logger.error(f"Probe failed again for {len(retried_failed_files)} file(s) after retry: {retried_failed_files}")
|
||||
|
||||
if retried_failed_files:
|
||||
logger.error(f"Probe failed again for {len(retried_failed_files)} file(s) after retry: {retried_failed_files}")
|
||||
final_success_list = successful_files + retried_successful_files
|
||||
logger.info(f"Retry complete. Final success count: {len(final_success_list)} file(s).")
|
||||
|
||||
final_success_list = successful_files + retried_successful_files
|
||||
if not final_success_list:
|
||||
raise AirflowException("All files failed to download or probe correctly, even after retry.")
|
||||
|
||||
logger.info(f"Retry complete. Final success count: {len(final_success_list)} file(s).")
|
||||
raise AirflowException("Download and probe process completed but produced no valid files.")
|
||||
|
||||
if params.get('yt_dlp_cleanup_mode', True):
|
||||
logger.info(f"Cleanup mode is enabled. Creating .empty files and deleting originals for {len(final_success_list)} files.")
|
||||
@ -567,6 +608,35 @@ def download_and_probe(token_data: dict, available_formats: list[str], **context
|
||||
logger.error(f"Error during cleanup for file {f}: {e}", exc_info=True)
|
||||
# Do not fail the task for a cleanup error, just log it.
|
||||
|
||||
# --- Move completed job directory to final destination ---
|
||||
try:
|
||||
video_id = _extract_video_id(original_url)
|
||||
if not video_id:
|
||||
logger.error(f"Could not extract video_id from URL '{original_url}' for final move. Skipping.")
|
||||
else:
|
||||
source_dir = download_dir # This is the job_dir_path
|
||||
|
||||
# Group downloads into 10-minute batch folders based on completion time.
|
||||
now = datetime.now()
|
||||
rounded_minute = (now.minute // 10) * 10
|
||||
timestamp_str = now.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}"
|
||||
|
||||
final_dir_base = os.path.join(Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles'), 'videos', 'ready', timestamp_str)
|
||||
final_dir_path = os.path.join(final_dir_base, video_id)
|
||||
|
||||
os.makedirs(final_dir_base, exist_ok=True)
|
||||
|
||||
logger.info(f"Moving completed job from '{source_dir}' to final destination '{final_dir_path}'")
|
||||
if os.path.exists(final_dir_path):
|
||||
logger.warning(f"Destination '{final_dir_path}' already exists. It will be removed and replaced.")
|
||||
shutil.rmtree(final_dir_path)
|
||||
|
||||
shutil.move(source_dir, final_dir_path)
|
||||
logger.info(f"Successfully moved job to '{final_dir_path}'.")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to move completed job directory: {e}", exc_info=True)
|
||||
# Do not fail the task for a move error, just log it.
|
||||
|
||||
return final_success_list
|
||||
|
||||
@task
|
||||
@ -799,7 +869,7 @@ with DAG(
|
||||
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string"),
|
||||
'machine_id': Param(None, type=["string", "null"]),
|
||||
'clients': Param('mweb,web_camoufox,tv', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"),
|
||||
'output_path_template': Param("%(title)s [%(id)s].f%(format_id)s.%(ext)s", type="string", title="[Worker Param] Output Path Template", description="Output filename template for yt-dlp. It is highly recommended to include `%(format_id)s` to prevent filename collisions when downloading multiple formats."),
|
||||
'output_path_template': Param("%(id)s.f%(format_id)s.%(ext)s", type="string", title="[Worker Param] Output Path Template", description="Output filename template for yt-dlp. It is highly recommended to include `%(format_id)s` to prevent filename collisions when downloading multiple formats."),
|
||||
'retry_on_probe_failure': Param(False, type="boolean"),
|
||||
'skip_probe': Param(False, type="boolean", title="[Worker Param] Skip Probe", description="If True, skips the ffmpeg probe of downloaded files."),
|
||||
'yt_dlp_cleanup_mode': Param(True, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
|
||||
|
||||
airflow/dags/ytdlp_s3_uploader.py (new file, 417 lines)
@@ -0,0 +1,417 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
DAG to upload completed video directories to an S3-compatible service.
|
||||
This DAG creates one long-running task for each configured S3 worker.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from airflow.decorators import task
|
||||
from airflow.exceptions import AirflowException
|
||||
from airflow.models.dag import DAG
|
||||
from airflow.models.param import Param
|
||||
from airflow.models.variable import Variable
|
||||
from airflow.operators.dummy import DummyOperator
|
||||
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
|
||||
from airflow.utils.dates import days_ago
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_ARGS = {
|
||||
'owner': 'airflow',
|
||||
'retries': 1,
|
||||
'retry_delay': timedelta(minutes=1),
|
||||
}
|
||||
|
||||
BASE_DOWNLOAD_PATH = '/opt/airflow/downloadfiles'
|
||||
VIDEOS_PATH = os.path.join(BASE_DOWNLOAD_PATH, 'videos')
|
||||
READY_PATH = os.path.join(VIDEOS_PATH, 'ready')
|
||||
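These paths assume the layout produced by the download DAGs, where each completed job is moved into a 10-minute batch folder under videos/ready/. A hedged sketch of the tree this DAG expects to find (directory and file names are invented examples; the exact file set depends on the worker's cleanup mode):

# Assumed on-disk layout consumed by run_s3_upload_batch (names are examples):
#
#   /opt/airflow/downloadfiles/videos/ready/
#       20250101T1430/                    <- 10-minute batch folder
#           dQw4w9WgXcQ/                  <- one directory per video id
#               info_dQw4w9WgXcQ_<account>_<timestamp>.json
#               dQw4w9WgXcQ.f140.m4a      (or a .empty marker if cleanup mode ran)
#       20250101T1440/
#           ...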
|
||||
def run_s3_upload_batch(**context):
|
||||
"""
|
||||
This function runs in a continuous loop to check for completed video directories and upload them to S3.
|
||||
If no videos are found, it sleeps for a configurable interval before checking again.
|
||||
Dry run mode is non-destructive and will pause briefly after checking to prevent tight loops.
|
||||
"""
|
||||
params = context['params']
|
||||
concurrency = params['concurrency']
|
||||
mode = params['mode']
|
||||
dry_run = params['dry_run']
|
||||
sleep_interval_min = params['sleep_if_no_videos_min']
|
||||
sleep_interval_sec = sleep_interval_min * 60
|
||||
s3_conn_id = params['s3_conn_id']
|
||||
|
||||
s3_access_key_id = None
|
||||
s3_secret_access_key = None
|
||||
s3_endpoint = None
|
||||
s3_bucket = None
|
||||
s3_region = None
|
||||
config_source = "Unknown"
|
||||
profile_name = "rusonyx"
|
||||
|
||||
# --- Attempt 1: Get S3 Configuration from Airflow Connection ---
|
||||
if s3_conn_id:
|
||||
try:
|
||||
logger.info(f"Attempting to load S3 configuration from Airflow connection '{s3_conn_id}'.")
|
||||
s3_hook = S3Hook(aws_conn_id=s3_conn_id)
|
||||
s3_conn = s3_hook.get_connection(s3_conn_id)
|
||||
|
||||
s3_access_key_id = s3_conn.login
|
||||
s3_secret_access_key = s3_conn.password
|
||||
s3_endpoint = s3_conn.host
|
||||
|
||||
extra_config = s3_conn.extra_dejson
|
||||
s3_bucket = extra_config.get('bucket')
|
||||
s3_region = extra_config.get('region_name')
|
||||
|
||||
if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_bucket, s3_region]):
|
||||
logger.warning("S3 connection from Airflow is missing one or more required fields. Will attempt to fall back to environment variables.")
|
||||
s3_access_key_id = s3_secret_access_key = s3_endpoint = s3_bucket = s3_region = None # Reset all
|
||||
else:
|
||||
config_source = f"Airflow Connection '{s3_conn_id}'"
|
||||
profile_name = "rusonyx-airflow"
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load S3 configuration from Airflow connection '{s3_conn_id}': {e}. Will attempt to fall back to environment variables.")
|
||||
|
||||
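For this branch to succeed, the Airflow connection has to carry the bucket and region in its Extra field, alongside the usual login/password/host. A hedged example of a connection shape that would satisfy this code (all values are placeholders; presumably the conn_id is the s3_delivery_connection referenced elsewhere in this commit):

# Placeholder example of the expected connection fields (not real credentials):
#   Login -> access key id, Password -> secret key, Host -> endpoint URL
# and the Extra JSON read via extra_dejson:
extra_example = {
    "bucket": "videos",            # consumed by extra_config.get('bucket')
    "region_name": "eu-central-1"  # consumed by extra_config.get('region_name')
}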
# --- Attempt 2: Fallback to Environment Variables ---
|
||||
if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_bucket, s3_region]):
|
||||
try:
|
||||
logger.info("Attempting to load S3 configuration from environment variables as a fallback.")
|
||||
s3_access_key_id = os.environ['S3_DELIVERY_AWS_ACCESS_KEY_ID']
|
||||
s3_secret_access_key = os.environ['S3_DELIVERY_AWS_SECRET_ACCESS_KEY']
|
||||
s3_endpoint = os.environ['S3_DELIVERY_ENDPOINT']
|
||||
s3_bucket = os.environ['S3_DELIVERY_BUCKET']
|
||||
s3_region = os.environ['S3_DELIVERY_AWS_REGION']
|
||||
|
||||
if not all([s3_access_key_id, s3_secret_access_key, s3_endpoint, s3_bucket, s3_region]):
|
||||
raise ValueError("One or more S3 configuration environment variables are empty.")
|
||||
config_source = "Environment Variables"
|
||||
profile_name = "rusonyx"
|
||||
|
||||
except (KeyError, ValueError) as e:
|
||||
logger.error(f"Having problems reading S3 configuration from environment variables: {e}", exc_info=True)
|
||||
raise AirflowException("S3 configuration is missing. Could not load from Airflow connection or environment variables.")
|
||||
|
||||
s3_destination = f"s3://{s3_bucket}/"
|
||||
|
||||
logger.info(f"Starting S3 upload loop. Watching source '{READY_PATH}' for delivery to '{s3_destination}'.")
|
||||
logger.info(f"Mode: {mode}, Dry Run: {dry_run}, Idle Sleep: {sleep_interval_min} min")
|
||||
logger.info(f"S3 Config loaded from {config_source}: Endpoint='{s3_endpoint}', Bucket='{s3_bucket}', Region='{s3_region}', Profile='{profile_name}'")
|
||||
|
||||
# --- Write credentials to file for s5cmd profile ---
|
||||
aws_credentials_path = os.path.expanduser("~/.aws/credentials")
|
||||
aws_config_path = os.path.expanduser("~/.aws/config")
|
||||
|
||||
try:
|
||||
os.makedirs(os.path.dirname(aws_credentials_path), exist_ok=True)
|
||||
|
||||
with open(aws_credentials_path, 'w') as f:
|
||||
f.write(f"[{profile_name}]\n")
|
||||
f.write(f"aws_access_key_id = {s3_access_key_id}\n")
|
||||
f.write(f"aws_secret_access_key = {s3_secret_access_key}\n")
|
||||
logger.info(f"Wrote credentials for profile '{profile_name}' to {aws_credentials_path}")
|
||||
|
||||
with open(aws_config_path, 'w') as f:
|
||||
f.write(f"[profile {profile_name}]\n")
|
||||
f.write(f"region = {s3_region}\n")
|
||||
logger.info(f"Wrote config for profile '{profile_name}' to {aws_config_path}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to write AWS credentials/config file: {e}", exc_info=True)
|
||||
raise AirflowException(f"Failed to write AWS credentials/config file: {e}")
|
||||
|
||||
while True:
|
||||
logger.info("--- Starting new S3 upload cycle ---")
|
||||
|
||||
# --- Dry Run Logic (Non-destructive) ---
|
||||
if dry_run:
|
||||
logger.info("[DRY RUN] Checking for completed video batches...")
|
||||
if not os.path.exists(READY_PATH):
|
||||
logger.info(f"[DRY RUN] Source directory '{READY_PATH}' does not exist. Nothing to upload.")
|
||||
else:
|
||||
now = datetime.now()
|
||||
wait_minutes = params['batch_completion_wait_min']
|
||||
cutoff_time = now - timedelta(minutes=wait_minutes)
|
||||
rounded_minute = (cutoff_time.minute // 10) * 10
|
||||
cutoff_batch_ts = cutoff_time.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}"
|
||||
logger.info(f"[DRY RUN] Current time is {now.strftime('%H:%M:%S')}. With a {wait_minutes} min wait, processing batches up to and including '{cutoff_batch_ts}'.")
|
||||
|
||||
all_video_dirs_to_process = []
|
||||
processed_batch_dirs = set()
|
||||
all_batch_dirs = sorted([d for d in os.listdir(READY_PATH) if os.path.isdir(os.path.join(READY_PATH, d))])
|
||||
|
||||
for ts_dir in all_batch_dirs:
|
||||
if ts_dir > cutoff_batch_ts:
|
||||
continue
|
||||
|
||||
batch_dir_path = os.path.join(READY_PATH, ts_dir)
|
||||
video_dirs_in_batch = [os.path.join(batch_dir_path, d) for d in os.listdir(batch_dir_path) if os.path.isdir(os.path.join(batch_dir_path, d))]
|
||||
|
||||
if video_dirs_in_batch:
|
||||
all_video_dirs_to_process.extend(video_dirs_in_batch)
|
||||
processed_batch_dirs.add(batch_dir_path)
|
||||
else:
|
||||
logger.info(f"[DRY RUN] Batch directory '{batch_dir_path}' is empty. Would remove it.")
|
||||
|
||||
if all_video_dirs_to_process:
|
||||
logger.info(f"[DRY RUN] Found {len(all_video_dirs_to_process)} total video director(y/ies) in {len(processed_batch_dirs)} batch(es) to process.")
|
||||
|
||||
# Construct and log the command that would be run
|
||||
cmd = [
|
||||
's5cmd', '--endpoint-url', s3_endpoint, '--log', 'debug', '--no-verify-ssl',
|
||||
'--use-list-objects-v1', '--profile', profile_name, '--stat',
|
||||
'--numworkers', str(concurrency), 'run'
|
||||
]
|
||||
cmd_str = ' '.join(cmd)
|
||||
|
||||
# Construct the commands to be piped
|
||||
commands_to_pipe = '\n'.join([f"cp \"{dir_path}\" \"{s3_destination}\"" for dir_path in all_video_dirs_to_process])
|
||||
|
||||
logger.info(f"[DRY RUN] The following command would be executed:\n{cmd_str}")
|
||||
logger.info(f"[DRY RUN] The following commands would be piped to stdin:\n{commands_to_pipe}")
|
||||
|
||||
if mode == 'mv':
|
||||
logger.info(f"[DRY RUN] Mode is 'mv'. Would delete {len(processed_batch_dirs)} source batch directories after successful upload.")
|
||||
|
||||
# Pause briefly in dry-run mode if videos are found to avoid a fast, noisy loop.
|
||||
dry_run_pause_s = 10
|
||||
logger.info(f"[DRY RUN] Pausing for {dry_run_pause_s} seconds to prevent rapid re-listing of the same files (this is a short, fixed pause for dry-run only).")
|
||||
time.sleep(dry_run_pause_s)
|
||||
continue # Go to the start of the next cycle
|
||||
else:
|
||||
logger.info("[DRY RUN] No completed video batches found.")
|
||||
|
||||
# If in dry-run and no videos are found, sleep for the main interval.
|
||||
logger.info(f"[DRY RUN] Sleeping for {sleep_interval_min} minute(s)...")
|
||||
time.sleep(sleep_interval_sec)
|
||||
continue
|
||||
|
||||
# --- Normal Operation Logic (Destructive) ---
|
||||
work_done_in_cycle = False
|
||||
try:
|
||||
# --- 1. Find all videos to upload from all completed batches ---
|
||||
if not os.path.exists(READY_PATH):
|
||||
logger.info(f"Ready directory '{READY_PATH}' does not exist. Nothing to upload.")
|
||||
else:
|
||||
now = datetime.now()
|
||||
wait_minutes = params['batch_completion_wait_min']
|
||||
cutoff_time = now - timedelta(minutes=wait_minutes)
|
||||
rounded_minute = (cutoff_time.minute // 10) * 10
|
||||
cutoff_batch_ts = cutoff_time.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}"
logger.info(f"Current time is {now.strftime('%H:%M:%S')}. With a {wait_minutes} min wait, processing batches up to and including '{cutoff_batch_ts}'.")

all_video_dirs_to_process = []
processed_batch_dirs = set()
all_batch_dirs = sorted([d for d in os.listdir(READY_PATH) if os.path.isdir(os.path.join(READY_PATH, d))])

for ts_dir in all_batch_dirs:
if ts_dir > cutoff_batch_ts:
continue # This batch is not old enough to be processed

batch_dir_path = os.path.join(READY_PATH, ts_dir)
video_dirs_in_batch = [os.path.join(batch_dir_path, d) for d in os.listdir(batch_dir_path) if os.path.isdir(os.path.join(batch_dir_path, d))]

if not video_dirs_in_batch:
logger.info(f"Batch directory '{batch_dir_path}' is empty. Removing it.")
try:
os.rmdir(batch_dir_path)
except OSError as e:
logger.error(f"Could not remove empty batch directory {batch_dir_path}: {e}")
continue # Move to the next batch

all_video_dirs_to_process.extend(video_dirs_in_batch)
processed_batch_dirs.add(batch_dir_path)

# --- 2. Upload All Found Videos in a Single Batch Command ---
if all_video_dirs_to_process:
work_done_in_cycle = True
logger.info(f"Found {len(all_video_dirs_to_process)} total video director(y/ies) in {len(processed_batch_dirs)} batch(es) to upload.")

cmd = [
's5cmd', '--endpoint-url', s3_endpoint, '--log', 'debug', '--no-verify-ssl',
'--use-list-objects-v1', '--profile', profile_name, '--stat',
'--numworkers', str(concurrency), 'run'
]
cmd_str = ' '.join(cmd)

# Construct the commands to be piped to stdin
commands_to_pipe = '\n'.join([f"cp \"{dir_path}\" \"{s3_destination}\"" for dir_path in all_video_dirs_to_process])
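# Illustrative example of one piped line (paths are hypothetical):
#   cp "/opt/airflow/downloadfiles/videos/ready/20241122T1050/VIDEO_ID" "s3://videos/some-prefix/"
# s5cmd's 'run' subcommand reads one command per line from stdin and executes them concurrently.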

logger.info(f"Executing s5cmd batch command:\n{cmd_str}")
logger.info(f"Piping {len(all_video_dirs_to_process)} 'cp' commands to stdin.")

upload_start_time = time.time()
process = subprocess.run(cmd, check=True, capture_output=True, text=True, input=commands_to_pipe)
upload_duration = time.time() - upload_start_time

logger.info(f"s5cmd STDOUT: {process.stdout}")
if process.stderr:
logger.info(f"s5cmd STDERR: {process.stderr}")
logger.info(f"Upload command completed successfully in {upload_duration:.2f} seconds.")
logger.info(f"Successfully copied {len(all_video_dirs_to_process)} director(y/ies) to S3.")

# --- 3. Cleanup ---
if mode == 'mv':
logger.info(f"Mode is 'mv'. Cleaning up {len(processed_batch_dirs)} source batch director(y/ies).")
cleanup_start_time = time.time()

# Create a temporary empty directory to use as a source for rsync deletion
empty_dir_for_rsync = os.path.join(READY_PATH, f"__empty_{int(time.time())}")
os.makedirs(empty_dir_for_rsync, exist_ok=True)

try:
for batch_dir_path in processed_batch_dirs:
try:
# Use rsync with an empty source to efficiently delete the contents of the batch directory
# The trailing slash on both source and destination is important.
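# Roughly equivalent shell invocation (illustrative): rsync -a --delete <empty_dir>/ <batch_dir>/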
rsync_cmd = [
'rsync',
'-a', '--delete',
f'{empty_dir_for_rsync}/',
f'{batch_dir_path}/'
]
subprocess.run(rsync_cmd, check=True, capture_output=True, text=True)

# After the contents are deleted, remove the now-empty directory
os.rmdir(batch_dir_path)
logger.info(f"Successfully removed {batch_dir_path}")
except Exception as cleanup_e:
logger.error(f"Failed to remove directory {batch_dir_path}: {cleanup_e}", exc_info=True)
if isinstance(cleanup_e, subprocess.CalledProcessError):
logger.error(f"rsync STDERR: {cleanup_e.stderr}")
finally:
# Clean up the temporary empty directory
shutil.rmtree(empty_dir_for_rsync)

cleanup_duration = time.time() - cleanup_start_time
logger.info(f"Cleanup complete in {cleanup_duration:.2f} seconds.")
else: # mode == 'cp'
logger.info(f"Mode is 'cp'. Source directories will be left for inspection.")

if not work_done_in_cycle:
logger.info(f"No completed video batches found in '{READY_PATH}'.")

except Exception as e:
logger.error(f"An error occurred during the S3 upload cycle: {e}", exc_info=True)
if isinstance(e, subprocess.CalledProcessError):
logger.error(f"s5cmd STDERR: {e.stderr}")
# On error, we do NOT clean up, to allow for investigation and retries.
# The failed directories will be picked up in the next cycle.
# Treat errors as "no work done" to trigger sleep and prevent fast failure loops
work_done_in_cycle = False

# --- Loop Control ---
if not work_done_in_cycle:
logger.info(f"No work done in this cycle. Sleeping for {sleep_interval_min} minute(s)...")
time.sleep(sleep_interval_sec)
else:
logger.info("Work was completed in this cycle. Checking for more immediately.")

with DAG(
dag_id='ytdlp_s3_uploader',
default_args=DEFAULT_ARGS,
schedule=None,
start_date=days_ago(1),
catchup=False,
tags=['ytdlp', 's3', 'upload'],
doc_md="""### S3 Uploader DAG

1. This DAG creates dynamic uploader tasks with clear names indicating their worker machine (e.g., `upload_batch_on_dl001`).
2. Ansible updates an Airflow Variable named `s3_worker_hostnames` with a JSON list of all active uploader workers (typically dlXXX machines). Each worker listens to its own queue (e.g., `queue-s3-dl001`).
3. The scheduler reads this variable whenever it re-parses the DAG file (for example after a pause/resume cycle) and creates one task per worker. This allows for easy inspection of per-worker logs and status from the Airflow UI.
4. Each dynamic task watches a shared folder (`/opt/airflow/downloadfiles/videos/ready`). Download workers place completed videos into timestamped sub-folders (e.g., `20241122T1050`). The uploader processes these 10-minute batches, copying them to S3 with `s5cmd` and then deleting the source directories. This design avoids race conditions and improves performance.
""",
params={
'mode': Param(
'mv', type="string", enum=['cp', 'mv'], title="Operation Mode",
description="`mv` (move): after a successful upload, the source batch directories are deleted. This is the standard behavior. `cp` (copy): the source batch directories are left intact for inspection and will be picked up again on the next cycle."
),
'dry_run': Param(
True, type="boolean", title="Dry Run",
description="If True, the task performs discovery only: it logs the s5cmd command and the directories that would be uploaded, but executes neither the upload nor the cleanup."
),
'concurrency': Param(10, type="integer", title="s5cmd Concurrency"),
'sleep_if_no_videos_min': Param(10, type="integer", title="Sleep if Idle (minutes)", description="How many minutes the task should sleep if no videos are found to upload."),
'batch_completion_wait_min': Param(0, type="integer", title="Batch Completion Wait (minutes)", description="How many minutes to wait after a 10-minute batch window closes before considering it for upload. Default is 0, which processes the current batch immediately. A value of 10 restores the old behavior of waiting for the next 10-minute window."),
's3_conn_id': Param('s3_delivery_connection', type="string", title="S3 Connection ID", description="The Airflow connection ID for the S3-compatible storage. If this connection is invalid or missing, the task will fall back to environment variables."),
}
) as dag:

# Dynamically create one task per S3 worker hostname
# IMPORTANT: The tasks are created when this DAG file is parsed by the Airflow Scheduler.
# If you add/change the 's3_worker_hostnames' Airflow Variable, you may need to
# wait a few minutes for the scheduler to re-parse the file and update the tasks.
# Forcing a re-parse can be done by pausing and un-pausing the DAG in the UI.
s3_worker_hostnames = [] # Initialize to be safe
try:
# The variable should be a JSON list of strings, e.g., ["s3-001", "s3-002"]
s3_worker_hostnames = Variable.get("s3_worker_hostnames", deserialize_json=True, default_var=[])
logger.info(f"DAG 'ytdlp_s3_uploader' successfully loaded s3_worker_hostnames variable. Value: {s3_worker_hostnames}")
if not isinstance(s3_worker_hostnames, list):
logger.error(f"Airflow Variable 's3_worker_hostnames' is not a valid JSON list. Value: {s3_worker_hostnames}")
s3_worker_hostnames = [] # Reset to empty to prevent errors
except Exception as e:
logger.error(
f"Could not read or parse Airflow Variable 's3_worker_hostnames'. "
f"Please create it in the Airflow UI as a JSON list of your S3 worker hostnames (e.g., [\"s3-001\"]). "
f"No S3 worker tasks will be created. Error: {e}",
exc_info=True
)
s3_worker_hostnames = []

@task(task_id='check_s3_worker_configuration')
def check_s3_worker_configuration_callable():
"""Logs the current value of the s3_worker_hostnames variable at runtime for debugging."""
logger.info("--- S3 Worker Configuration Check (at runtime) ---")
try:
hostnames = Variable.get("s3_worker_hostnames", deserialize_json=True, default_var=None)
if hostnames is None:
logger.error("Airflow Variable 's3_worker_hostnames' is not defined.")
logger.info("Please create it in the Airflow UI (Admin -> Variables) as a JSON list of strings, e.g., [\"s3-worker-01\"]")
elif not isinstance(hostnames, list):
logger.error(f"Airflow Variable 's3_worker_hostnames' is not a valid JSON list. Current value: {hostnames}")
elif not hostnames:
logger.warning("Airflow Variable 's3_worker_hostnames' is defined but is an empty list []. No worker tasks will be run.")
else:
logger.info(f"Successfully read 's3_worker_hostnames'. It contains {len(hostnames)} worker(s): {hostnames}")
logger.info("If you see this task but no worker tasks in the UI, it means the DAG did not find these workers when it was parsed by the scheduler.")
logger.info("This can happen due to caching. Please wait a few minutes for the scheduler to re-parse the DAG file, or pause/un-pause the DAG.")

except Exception as e:
logger.error(f"An error occurred while trying to read the 's3_worker_hostnames' variable at runtime: {e}", exc_info=True)
logger.info("--- End of Configuration Check ---")

check_s3_worker_configuration_task = check_s3_worker_configuration_callable()
check_s3_worker_configuration_task.doc_md = """
### S3 Worker Configuration Check

This task runs at the start of every DAG run to check the `s3_worker_hostnames` Airflow Variable.

The dynamic worker tasks are created based on this variable *at the time the DAG is parsed by the scheduler*.

**Check the logs for this task to see the current value of the variable as read at runtime.** This can help diagnose why worker tasks may not have been created.

If the logs show the variable is correct but you don't see the worker tasks in the UI, you may need to wait for the scheduler to re-parse the DAG file. You can force this by pausing and un-pausing the DAG.
"""
if s3_worker_hostnames:
worker_tasks = []
for hostname in s3_worker_hostnames:
# Sanitize hostname for task_id
task_id_hostname = hostname.replace('.', '_')

# Create a task for each worker, pinned to its specific queue
upload_task = task(
task_id=f'upload_batch_on_{task_id_hostname}',
queue=f'queue-s3-{hostname}'
)(run_s3_upload_batch)()
worker_tasks.append(upload_task)

check_s3_worker_configuration_task >> worker_tasks
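For context, here is a minimal sketch (not part of this commit) of how a download worker could hand a finished video over to this uploader, assuming the shared layout described in the DAG's doc_md; the function names and the video-ID argument are hypothetical:

import os
import shutil
from datetime import datetime

READY_PATH = "/opt/airflow/downloadfiles/videos/ready"  # shared folder referenced by the DAG's doc_md

def current_batch_dir(now=None):
    """Return the current 10-minute batch directory, e.g. .../20241122T1050."""
    now = now or datetime.now()
    ts = now.strftime("%Y%m%dT%H") + f"{(now.minute // 10) * 10:02d}"
    return os.path.join(READY_PATH, ts)

def publish_video(tmp_video_dir, video_id):
    """Move a fully downloaded video directory into the current batch folder."""
    batch_dir = current_batch_dir()
    os.makedirs(batch_dir, exist_ok=True)
    dest = os.path.join(batch_dir, video_id)
    # A move (rename) on the same filesystem is effectively atomic, so the uploader
    # never sees a half-written video directory inside a batch.
    shutil.move(tmp_video_dir, dest)
    return dest

Moving the finished directory into the timestamped batch as the final step is what keeps the 10-minute batching race-free from the uploader's point of view.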
3767
airflow/inputfiles/urls.rt.txt
Normal file
File diff suppressed because it is too large
@ -8,3 +8,8 @@ vault_ss_password_2: "tgtQcfjJp/A3F01g4woO0bEQoxij3CAOK/iR1OTPuF4="
vault_dockerhub_password: "dckr_pat_DmFFqwFEdXFvZlgngGY9ooBaq6o"
vault_s3_access_key_id: "admin"
vault_s3_secret_access_key: "0153093693-0009"
vault_s3_delivery_access_key_id: "4d33e37e87c945718478e8003f6e93fb"
vault_s3_delivery_secret_access_key: "33b155c5d2ea4fccb0faeeefb420d7ac"
vault_s3_delivery_endpoint: "https://s3.rusonyxcloud.ru"
vault_s3_delivery_bucket: "videos"
vault_s3_delivery_aws_region: "ru-msk"

@ -2,8 +2,8 @@
- name: Deploy Airflow DL Worker Stack
hosts: airflow_workers
vars_files:
- group_vars/all.yml
- group_vars/all/vault.yml
- "{{ inventory_dir }}/group_vars/all/vault.yml"
- "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
pre_tasks:
- name: Announce fast deploy mode if enabled
debug:
@ -17,13 +17,17 @@
path: "{{ airflow_worker_dir }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes

- name: Template .env.worker
- name: Template .env file for worker
template:
src: templates/.env.worker.j2
src: templates/.env.j2
dest: "{{ airflow_worker_dir }}/.env"
mode: '0600'
vars:
service_role: "worker"

- name: Template docker-compose file for Airflow worker
template:
@ -31,6 +35,34 @@
dest: "{{ airflow_worker_dir }}/configs/docker-compose-dl.yaml"
mode: '0644'

- name: Ensure configs directory exists for config generator
file:
path: "{{ airflow_worker_dir }}/configs"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes

- name: Sync config generator script
ansible.posix.synchronize:
src: ../airflow/generate_envoy_config.py
dest: "{{ airflow_worker_dir }}/"
archive: yes
rsync_path: "sudo rsync"

- name: Sync config generator templates
ansible.posix.synchronize:
src: ../airflow/configs/{{ item }}
dest: "{{ airflow_worker_dir }}/configs/"
archive: yes
rsync_path: "sudo rsync"
loop:
- docker-compose.config-generate.yaml
- envoy.yaml.j2
- docker-compose.camoufox.yaml.j2
- docker-compose-ytdlp-ops.yaml.j2

- name: Build Airflow worker image from local Dockerfile
community.docker.docker_image:
name: "{{ airflow_image_name }}"
@ -50,7 +82,7 @@

- name: Generate dynamic configs (camoufox + envoy)
shell:
cmd: "docker compose -f configs/docker-compose.config-generate.yaml run --rm config-generator"
cmd: "docker compose --project-directory . -f configs/docker-compose.config-generate.yaml run --rm config-generator"
chdir: "{{ airflow_worker_dir }}"

- name: Start worker services
@ -59,6 +91,7 @@
files:
- configs/docker-compose-dl.yaml
- configs/docker-compose-ytdlp-ops.yaml
- configs/docker-compose.camoufox.yaml
state: present
remove_orphans: true
pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"

@ -2,14 +2,30 @@
- name: Deploy Airflow DL Worker Stack
hosts: airflow_workers
vars_files:
- group_vars/all.yml
- group_vars/all/vault.yml
- "{{ inventory_dir }}/group_vars/all/vault.yml"
- "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
pre_tasks:
- name: Announce fast deploy mode if enabled
debug:
msg: "🚀 FAST DEPLOY MODE ENABLED: Skipping Docker image builds and pulls. 🚀"
when: fast_deploy | default(false)
run_once: true

- name: Install python3-pip
ansible.builtin.apt:
name: python3-pip
state: present
become: yes

- name: Install required python packages for ytops-client on host
ansible.builtin.pip:
name:
- thrift
- aria2p
- PyYAML
state: present
extra_args: --break-system-packages
become: yes
tasks:

- name: Ensure worker directory exists
@ -17,13 +33,17 @@
path: "{{ airflow_worker_dir }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes

- name: Template .env.worker
- name: Template .env file for worker
template:
src: templates/.env.worker.j2
src: templates/.env.j2
dest: "{{ airflow_worker_dir }}/.env"
mode: '0600'
vars:
service_role: "worker"

- name: Template docker-compose file for Airflow worker
template:
@ -31,6 +51,34 @@
dest: "{{ airflow_worker_dir }}/configs/docker-compose-dl.yaml"
mode: '0644'

- name: Ensure configs directory exists for config generator
file:
path: "{{ airflow_worker_dir }}/configs"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes

- name: Sync config generator script
ansible.posix.synchronize:
src: ../airflow/generate_envoy_config.py
dest: "{{ airflow_worker_dir }}/"
archive: yes
rsync_path: "sudo rsync"

- name: Sync config generator templates
ansible.posix.synchronize:
src: ../airflow/configs/{{ item }}
dest: "{{ airflow_worker_dir }}/configs/"
archive: yes
rsync_path: "sudo rsync"
loop:
- docker-compose.config-generate.yaml
- envoy.yaml.j2
- docker-compose.camoufox.yaml.j2
- docker-compose-ytdlp-ops.yaml.j2

- name: Build Airflow worker image from local Dockerfile
community.docker.docker_image:
name: "{{ airflow_image_name }}"
@ -49,7 +97,7 @@

- name: Generate dynamic configs (camoufox + envoy)
shell:
cmd: "docker compose -f configs/docker-compose.config-generate.yaml run --rm config-generator"
cmd: "docker compose --project-directory . -f configs/docker-compose.config-generate.yaml run --rm config-generator"
chdir: "{{ airflow_worker_dir }}"

- name: Start worker services
@ -61,3 +109,22 @@
state: present
remove_orphans: true
pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"

- name: Stop camoufox services (deprecated)
community.docker.docker_compose_v2:
project_src: "{{ airflow_worker_dir }}"
files:
- configs/docker-compose.camoufox.yaml
state: absent
ignore_errors: true

- name: Update Airflow variable with S3 worker hostnames
ansible.builtin.command: >
docker compose exec -T airflow-scheduler
airflow variables set s3_worker_hostnames
'{{ groups["airflow_workers"] | map("regex_replace", "\\..*", "") | list | to_json }}'
args:
chdir: "{{ airflow_master_dir }}"
become: yes
delegate_to: "{{ groups['airflow_master'][0] }}"
run_once: true
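# Illustrative mapping (hostnames are examples): ["dl001.example.com", "dl002.example.com"]
# becomes '["dl001", "dl002"]'; the regex_replace strips the domain part so the variable matches
# the short hostnames used for the per-worker queues in the DAG (e.g. queue-s3-dl001).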

@ -189,6 +189,17 @@
- name: Restart Airflow worker on WORKER to apply hook
when: inventory_hostname in groups['airflow_workers']
ansible.builtin.command:
cmd: "docker compose restart airflow-worker-dl airflow-worker-auth"
cmd: "docker compose restart airflow-worker-dl airflow-worker-auth airflow-worker-mgmt airflow-worker-s3"
chdir: "{{ airflow_worker_dir }}"
become: yes

- name: Update Airflow variable with S3 worker hostnames
ansible.builtin.command: >
docker compose exec -T airflow-scheduler
airflow variables set s3_worker_hostnames
'{{ groups["airflow_workers"] | map("regex_replace", "\\..*", "") | list | to_json }}'
args:
chdir: "{{ airflow_master_dir }}"
become: yes
when: inventory_hostname in groups['airflow_master']
run_once: true

@ -68,6 +68,6 @@
- name: Restart Airflow worker on WORKER
when: inventory_hostname in groups['airflow_workers']
ansible.builtin.command:
cmd: "docker compose restart airflow-worker-dl airflow-worker-auth"
cmd: "docker compose restart airflow-worker-dl airflow-worker-auth airflow-worker-mgmt airflow-worker-s3"
chdir: "{{ airflow_worker_dir }}"
become: yes

@ -13,6 +13,22 @@
debug:
msg: "Starting deployment for Airflow Master: {{ inventory_hostname }} ({{ ansible_host }})"

- name: Install python3-pip
ansible.builtin.apt:
name: python3-pip
state: present
become: yes

- name: Install required python packages
ansible.builtin.pip:
name:
- thrift
- aria2p
- PyYAML
- apache-airflow-providers-amazon
state: present
become: yes

- name: Configure Redis memory overcommit setting
copy:
src: "configs/etc/sysctl.d/99-redis-overcommit.conf"
@ -156,6 +172,20 @@
mode: '0755'
become: yes

- name: Ensure runtime data directories exist with correct ownership
ansible.builtin.file:
path: "{{ airflow_master_dir }}/{{ item }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0775'
recurse: yes
loop:
- "downloadfiles"
- "inputfiles"
- "dumps"
become: yes

- name: Sync python packages to master for build context
ansible.posix.synchronize:
src: "../{{ item }}/"
@ -249,14 +279,58 @@
var: config_generator_result.stdout_lines
when: config_generator_result.changed

- name: Start ytdlp-ops services on master
community.docker.docker_compose_v2:
project_src: "{{ airflow_master_dir }}"
files:
- configs/docker-compose-ytdlp-ops.yaml
state: present
remove_orphans: true
pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"

roles:
- ytdlp-master
- airflow-master

post_tasks:
- name: Include camoufox verification tasks
include_tasks: tasks/verify_camoufox.yml
when: not fast_deploy | default(false)
- name: Delete existing Airflow redis_default connection to ensure an idempotent update
ansible.builtin.command: >
docker compose exec -T airflow-scheduler
airflow connections delete redis_default
args:
chdir: "{{ airflow_master_dir }}"
register: delete_redis_conn
retries: 5
delay: 10
until: delete_redis_conn.rc == 0 or 'not found' in delete_redis_conn.stderr
changed_when: "'was deleted successfully' in delete_redis_conn.stdout"
failed_when:
- delete_redis_conn.rc != 0
- "'not found' not in delete_redis_conn.stderr"
become: yes
become_user: "{{ ansible_user }}"

- name: Add Airflow redis_default connection
ansible.builtin.command: >
docker compose exec -T airflow-scheduler
airflow connections add redis_default
--conn-uri 'redis://:{{ vault_redis_password }}@{{ ansible_host }}:{{ redis_port }}/{{ redis_db_celery_broker | default(1) }}'
args:
chdir: "{{ airflow_master_dir }}"
register: add_redis_conn
retries: 5
delay: 10
until: add_redis_conn.rc == 0
changed_when: "'was successfully added' in add_redis_conn.stdout"
become: yes
become_user: "{{ ansible_user }}"

- name: Update S3 delivery connection
ansible.builtin.import_playbook: playbook-update-s3-vars.yml

# - name: Include camoufox verification tasks
# include_tasks: tasks/verify_camoufox.yml
# when: not fast_deploy | default(false)

- name: Run regression test
command: >

58
ansible/playbook-update-s3-vars.yml
Normal file
@ -0,0 +1,58 @@
---
- name: Update S3 Delivery Airflow Connection
hosts: airflow_master
vars_files:
- "{{ inventory_dir }}/group_vars/all/vault.yml"
- "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
tasks:
- name: Delete existing s3_delivery_connection to ensure an idempotent update
ansible.builtin.command: >
docker compose exec -T airflow-scheduler
airflow connections delete s3_delivery_connection
args:
chdir: "{{ airflow_master_dir }}"
register: delete_s3_conn
retries: 5
delay: 10
until: delete_s3_conn.rc == 0 or 'Did not find a connection' in delete_s3_conn.stderr
changed_when: "'was deleted successfully' in delete_s3_conn.stdout"
failed_when:
- delete_s3_conn.rc != 0
- "'Did not find a connection' not in delete_s3_conn.stderr"
become: yes
become_user: "{{ ansible_user }}"

- name: Add/Update s3_delivery_connection
ansible.builtin.command:
argv:
- docker
- compose
- exec
- -T
- airflow-scheduler
- airflow
- connections
- add
- s3_delivery_connection
- --conn-type
- aws
- --conn-login
- "{{ vault_s3_delivery_access_key_id }}"
- --conn-password
- "{{ vault_s3_delivery_secret_access_key }}"
- --conn-host
- "{{ vault_s3_delivery_endpoint }}"
- --conn-extra
- "{{ s3_extra_dict | to_json }}"
chdir: "{{ airflow_master_dir }}"
vars:
s3_extra_dict:
bucket: "{{ vault_s3_delivery_bucket }}"
region_name: "{{ vault_s3_delivery_aws_region }}"
register: add_s3_conn
retries: 5
delay: 10
until: add_s3_conn.rc == 0
changed_when: "'was successfully added' in add_s3_conn.stdout"
become: yes
become_user: "{{ ansible_user }}"
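For reference, a minimal sketch (not part of this commit) of how a task could resolve the s3_delivery_connection created above and fall back to environment variables, as the DAG's `s3_conn_id` parameter describes; the fallback variable names are assumptions:

import os
from airflow.hooks.base import BaseHook

def resolve_s3_settings(conn_id: str = "s3_delivery_connection") -> dict:
    """Return endpoint/credentials for s5cmd, preferring the Airflow connection."""
    try:
        conn = BaseHook.get_connection(conn_id)
        extra = conn.extra_dejson or {}
        return {
            "endpoint": conn.host,
            "access_key": conn.login,
            "secret_key": conn.password,
            "bucket": extra.get("bucket"),
            "region": extra.get("region_name"),
        }
    except Exception:
        # Fallback to environment variables (names assumed for illustration only).
        return {
            "endpoint": os.environ.get("S3_ENDPOINT_URL"),
            "access_key": os.environ.get("AWS_ACCESS_KEY_ID"),
            "secret_key": os.environ.get("AWS_SECRET_ACCESS_KEY"),
            "bucket": os.environ.get("S3_BUCKET"),
            "region": os.environ.get("AWS_DEFAULT_REGION"),
        }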
@ -147,6 +147,23 @@
mode: '0755'
become: yes

- name: Ensure runtime data directories exist with correct ownership
ansible.builtin.file:
path: "{{ airflow_worker_dir }}/{{ item }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0775'
recurse: yes
loop:
- "downloadfiles"
- "downloadfiles/videos"
- "downloadfiles/videos/in-progress"
- "downloadfiles/videos/ready"
- "inputfiles"
- "dumps"
become: yes

- name: Create .dockerignore on worker to exclude runtime data from build context
ansible.builtin.copy:
dest: "{{ airflow_worker_dir }}/.dockerignore"

@ -57,8 +57,8 @@ YTDLP_TIMEOUT=600
# --- Camoufox (Browser) Configuration ---
CAMOUFOX_PROXIES="{{ (worker_proxies | default([])) | join(',') }}"
VNC_PASSWORD="{{ vault_vnc_password }}"
CAMOUFOX_BASE_VNC_PORT={{ camoufox_base_vnc_port }}
CAMOUFOX_PORT={{ camoufox_base_port }}
CAMOUFOX_BASE_VNC_PORT={{ camoufox_base_vnc_port | default(5901) }}
CAMOUFOX_PORT={{ camoufox_base_port | default(9070) }}

# --- Account Manager Configuration ---
ACCOUNT_ACTIVE_DURATION_MIN={{ account_active_duration_min | default(7) }}

10
cli.config
@ -18,8 +18,9 @@
# Retry fragments 10 times
--fragment-retries 10

# Limit download rate to 5M
--limit-rate 5M
# Use a fixed buffer size to stabilize throughput and avoid traffic shaping
--no-resize-buffer
--buffer-size 4M

# Socket timeout
--socket-timeout 15
@ -31,5 +32,10 @@
# Progress
--progress

# Merge to mp4 by default
--merge-output-format mp4

# Don't use "NA" in filenames if metadata is missing
--output-na-placeholder ""

--no-part
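# Illustrative usage of this config file (path is an example): yt-dlp --config-locations ./cli.config <URL>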

58
policies/4_custom_scenarios.yaml
Normal file
@ -0,0 +1,58 @@
# This file contains custom policies for specific testing scenarios.

---
# Policy: Fetch info.json with visitor ID rotation.
# This policy uses a single worker to fetch info.json files for a list of URLs.
# It simulates user churn by creating a new profile (and thus a new visitor_id and POT)
# every 250 requests. A short sleep is used between requests.
name: fetch_with_visitor_id_rotation

settings:
mode: fetch_only
urls_file: "urls.txt" # Placeholder, should be overridden with --set
info_json_script: "bin/ytops-client get-info"
save_info_json_dir: "fetched_info_jsons/visitor_id_rotation"
# Use the modern profile management system to rotate visitor_id.
profile_mode: per_worker_with_rotation
profile_management:
prefix: "visitor_rotator"
# Rotate to a new profile generation after 250 requests.
max_requests_per_profile: 250

execution_control:
run_until: { cycles: 1 } # Run through the URL list once.
workers: 1 # Run with a single worker thread.
# A short, fixed sleep between each info.json request.
sleep_between_tasks: { min_seconds: 0.75, max_seconds: 0.75 }

info_json_generation_policy:
# Use a standard client. The server will handle token generation.
client: web

---
# Policy: Test download specific DASH formats from a folder of info.jsons.
# This policy uses a single worker to test-download a list of video-only DASH
# formats from a directory of existing info.json files. It only downloads the
# first 10KB of each format and sleeps between each file.
name: download_dashy_formats_test

settings:
mode: download_only
# Directory of info.json files to process.
info_json_dir: "fetched_info_jsons/visitor_id_rotation" # Assumes output from the above policy

execution_control:
run_until: { cycles: 1 } # Run through the info.json directory once.
workers: 1 # Run with a single worker thread.
# A longer, randomized sleep between processing each info.json file.
sleep_between_tasks: { min_seconds: 5, max_seconds: 10 }

download_policy:
# A specific list of video-only DASH formats to test.
# The "-dashy" suffix is illustrative; the format IDs must exist in the info.json.
formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy"
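# For reference, the base itags are the commonly documented video-only MP4 streams
# (299/298 = 1080p60/720p60, 137/136/135/134/133 = 1080p/720p/480p/360p/240p).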
# Use the native Python downloader for better performance and control.
downloader: "native-py"
# Pass extra arguments to yt-dlp to perform a "test" download (first 10KB).
extra_args: '--download-sections "*0-10240"'
output_dir: "downloads/dash_test"
1
thrift_model/.gitignore
vendored
@ -1,2 +1 @@
__py_cache__
target/

@ -179,13 +179,63 @@ def main_download_native_py(args):
ydl_opts = {}
if base_opts_args:
try:
# This is an internal API, but it's the most accurate way to parse CLI args
# into the ydl_opts dictionary format.
# yt-dlp's parse_options can return 3 or 4 values. We only need the `opts` namespace (second value).
_parser, opts, _args, *_ = yt_dlp.parse_options(base_opts_args)
ydl_opts = vars(opts)
logger.info(f"Parsing {len(base_opts_args)} arguments from config/extra_args...")
i = 0
while i < len(base_opts_args):
arg = base_opts_args[i]
if not arg.startswith('--'):
logger.warning(f"Skipping non-option argument in extra args: {arg}")
i += 1
continue

key = arg.lstrip('-').replace('-', '_')

# Handle flags (no value)
is_flag = i + 1 >= len(base_opts_args) or base_opts_args[i + 1].startswith('--')

if is_flag:
if key.startswith('no_'):
# Handle --no-foo flags
ydl_opts[key[3:]] = False
else:
ydl_opts[key] = True
logger.debug(f"Parsed flag: {key} = {ydl_opts.get(key[3:] if key.startswith('no_') else key)}")
i += 1
# Handle options with values
else:
value = base_opts_args[i + 1]
# Try to convert values to numbers, which yt-dlp expects.
# This includes parsing byte suffixes like 'K', 'M', 'G'.
if isinstance(value, str):
original_value = value
value_upper = value.upper()
multipliers = {'K': 1024, 'M': 1024**2, 'G': 1024**3, 'T': 1024**4}
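# e.g. a value of "5M" becomes 5 * 1024**2 = 5242880 (illustrative; mirrors yt-dlp's byte-suffix convention)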

if value_upper and value_upper[-1] in multipliers:
try:
num = float(value[:-1])
value = int(num * multipliers[value_upper[-1]])
except (ValueError, TypeError):
value = original_value # fallback
else:
try:
value = int(value)
except (ValueError, TypeError):
try:
value = float(value)
except (ValueError, TypeError):
value = original_value # fallback

# Special handling for keys that differ from CLI arg, e.g. --limit-rate -> ratelimit
if key == 'limit_rate':
key = 'ratelimit'
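# Any other CLI option whose ydl_opts key differs from its flag name would need a similar mapping here.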

ydl_opts[key] = value
logger.debug(f"Parsed option: {key} = {value}")
i += 2
logger.info("Successfully parsed extra yt-dlp options.")
except Exception as e:
logger.error(f"Failed to parse options from config/extra_args: {e}")
logger.error(f"Failed to parse options from config/extra_args: {e}", exc_info=True)
return 1

# Now, layer the script's explicit arguments on top, as they have higher precedence.