# File: yt-dlp-dags/ytops_client-source/policies/11_direct_docker_download_simulation.yaml
# Timestamp: 2025-12-27 16:57:26 +03:00
# Listing metadata: 109 lines, 4.6 KiB, YAML

# Policy: Continuous Download Simulation via Direct Docker Exec
#
# Simulates a continuous stream of downloads using the 'direct_docker_cli'
# orchestration mode together with `mode: download_only`. Task files
# (info.json files) are picked up from a directory, and for each one a yt-dlp
# command is invoked inside a running Docker container to perform the download.
#
name: direct_docker_download_simulation
# Run-wide settings: operating mode and the location of the task files.
settings:
  mode: download_only
  orchestration_mode: direct_docker_cli
  profile_mode: from_pool_with_lock
  # Directory holding the info.json task files generated by an auth
  # simulation, such as `10_direct_docker_auth_simulation`.
  # It MUST be located inside `docker_host_mount_path`.
  info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation"
  # Regex applied to a task filename to extract the profile name; the first
  # capture group is used. This is crucial for the task-first locking strategy.
  # It matches a component that starts with 'user' between two hyphens.
  profile_extraction_regex: '^.+?-(user[^-]+)-'
# Worker scheduling for the simulation.
execution_control:
  workers: 1
  # Seconds a worker pauses when it cannot find an available profile or task.
  worker_polling_interval_seconds: 1
# What to download, how long profiles rest afterwards, and where results go.
download_policy:
  profile_prefix: "user1"
  # Comma-separated list of format IDs to download for each info.json.
  # Used by the dummy-mode simulation to test per-format downloads; in
  # non-dummy mode the format selector in `ytdlp_config_overrides` is used.
  formats: "140-dashy,299-dashy"
  # Fallback cooldown in seconds, used only when the enforcer has not set one
  # in Redis. The Redis value (set via `unlock_cooldown_seconds` in the
  # enforcer policy) always takes precedence.
  # Can be an integer (e.g., 1) or a range (e.g., [1, 3]).
  default_unlock_cooldown_seconds: 1
  # If true, check whether the download URL in the info.json has expired
  # before attempting the download. Enabled by default.
  check_url_expiration: true
  # --- Airflow Integration ---
  # If true, move downloaded media and info.json into a timestamped,
  # video-id-based directory structure that the Airflow DAGs can process.
  output_to_airflow_ready_dir: true
  airflow_ready_dir_base_path: "downloadfiles/videos/ready"
# Simulation environment names. This top-level key must appear only once:
# duplicate mapping keys are invalid YAML and most parsers silently keep the
# last occurrence.
simulation_parameters:
  download_env: "sim_download"

direct_docker_cli_policy:
  # Which simulation environment's profiles to use for locking.
  # NOTE(review): presumably "download" selects
  # `simulation_parameters.download_env` - confirm against the orchestrator.
  use_profile_env: "download"
  # If true, a worker will try to lock a different profile than the one it
  # just used. Disabled for downloads, as the cooldown mechanism is sufficient.
  avoid_immediate_profile_reuse: false
  # How long the worker should wait for a different profile before re-using
  # the same one.
  avoid_reuse_max_wait_seconds: 5
  # NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for
  # guest sessions is ~300 videos/hour (~1000 webpage/player requests per
  # hour). For accounts, it is ~2000 videos/hour (~4000 webpage/player
  # requests per hour). The enforcer policy should be configured to respect
  # these limits via rotation and rest periods.

  # --- Docker Execution Settings ---
  docker_image_name: "ytops/yt-dlp"
  docker_network_name: "airflow_proxynet"
  # Host path mounted into the container for task files (info.json, config).
  # IMPORTANT: This must be the SAME host path used for `info_json_dir`, or a
  # parent directory of it, so the container can see the task files.
  docker_host_mount_path: "run/docker_mount"
  docker_container_mount_path: "/config"
  # Path on the HOST where downloaded files will be saved.
  docker_host_download_path: "downloaded_media/direct_docker_simulation"
  # Path inside the CONTAINER where `docker_host_download_path` is mounted.
  docker_container_download_path: "/downloads"
  # A base config file can be used, with overrides applied from the policy.
  # The orchestrator will inject 'proxy', 'load-info-json', and 'output' keys
  # into the overrides.
  ytdlp_config_file: "cli.download.config"
  ytdlp_config_overrides:
    format: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy,140-dashy/140-dashy-0/140"
    # Alternative selector, kept for reference:
    # format: "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"
    no-resize-buffer: true
    buffer-size: "4M"
    concurrent-fragments: 8
  ytdlp_raw_args: []

  # --- Live Error Parsing Rules ---
  # If a fatal error is detected, immediately ban the profile to stop the
  # container.
  ban_on_fatal_error_in_batch: true
  fatal_error_patterns:
    - "HTTP Error 403"
    - "HTTP Error 502"
  tolerated_error_patterns:
    - "timed out"
    - "Timeout"
    - "connection reset by peer"
    - "Invalid data found when processing input"
    - "Error opening input files"