# Listing metadata from the file viewer: 148 lines, 5.9 KiB, YAML
# Policy: Queue-based Download Simulation
#
# This policy simulates a continuous stream of downloads. It pulls download tasks
# from a Redis queue, where each task typically contains a path to an info.json
# file and a format to download.
#
# This policy is designed to be the *second stage* of a two-stage simulation,
# consuming tasks produced by an authentication simulation such as:
# - `12_queue_auth_simulation.yaml` (when configured for Queue-Download workflow)
#
# It does not matter to this policy whether the auth stage created tasks per-URL
# or per-format; this worker will simply process whatever task it receives from
# the `input_queue`.
#
# Name of this policy.
name: 13_queue_download_simulation

# Mode selection for this policy.
settings:
  mode: download_only
  orchestration_mode: queue_download
  profile_mode: from_pool_with_lock
  # In queue mode, info_json_dir is not used to find tasks.
  # However, the paths inside the download tasks must be accessible
  # within the docker_host_mount_path.
  # The profile_extraction_regex is also not needed as the profile
  # can be specified in the download task.

# Worker pool layout and polling behaviour.
execution_control:
  # Define worker pools for multiple user groups.
  # Each entry pairs a profile prefix with the number of workers for that pool.
  worker_pools:
    - profile_prefix: "user1"
      workers: 1
    - profile_prefix: "user2"
      workers: 1
    - profile_prefix: "user3"
      workers: 1
  # How long a worker should pause if it cannot find an available profile or task.
  worker_polling_interval_seconds: 1

# Download behaviour: profile cooldown fallback, URL expiry check and
# post-download file placement.
download_policy:
  # profile_prefix is now defined per-pool in execution_control.worker_pools
  # Default cooldown in seconds if not specified by the enforcer in Redis.
  # The value from Redis (set via `unlock_cooldown_seconds` in the enforcer policy)
  # will always take precedence. This is a fallback.
  # Can be an integer (e.g., 1) or a range (e.g., [1, 3]).
  default_unlock_cooldown_seconds: 1
  # If true, check if the download URL in the info.json is expired before
  # attempting to download. This is enabled by default.
  check_url_expiration: true
  # --- Airflow Integration ---
  # If true, move downloaded media and info.json to a timestamped, video-id-based
  # directory structure that the Airflow DAGs can process.
  output_to_airflow_ready_dir: true
  airflow_ready_dir_base_path: "downloadfiles/videos/ready"

# Settings for running yt-dlp through the Docker CLI: profile locking,
# container mounts, yt-dlp configuration and live error handling.
direct_docker_cli_policy:
  # Which simulation environment's profiles to use for locking.
  use_profile_env: "download"

  # If true, a worker will try to lock a different profile than the one it just used.
  # This is disabled for downloads, as the cooldown mechanism is sufficient.
  avoid_immediate_profile_reuse: false
  # How long the worker should wait for a different profile before re-using the same one.
  avoid_reuse_max_wait_seconds: 5

  # NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest
  # sessions is ~300 videos/hour (~1000 webpage/player requests per hour).
  # For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour).
  # The enforcer policy should be configured to respect these limits via
  # rotation and rest periods.

  # --- Docker Execution Settings ---
  docker_image_name: "ytops/yt-dlp"
  docker_network_name: "airflow_proxynet"
  # Host path mounted into the container for task files (info.json, config).
  # IMPORTANT: This policy does not set `info_json_dir` (it is unused in queue
  # mode), so the info.json paths carried by the download tasks must be located
  # under this host path for the container to see the task files.
  docker_host_mount_path: "run/docker_mount"
  docker_container_mount_path: "/config"

  # Path on the HOST where downloaded files will be saved.
  # NOTE: This path should be on a fast, local disk, NOT on s3fs.
  docker_host_download_path: "downloaded_media/queue_downloads"
  # Path inside the CONTAINER where `docker_host_download_path` is mounted.
  docker_container_download_path: "/downloads"

  # A base config file can be used, with overrides applied from the policy.
  # The orchestrator will inject 'proxy', 'load-info-json', and 'output' keys
  # into the overrides.
  ytdlp_config_file: "cli.download.config"
  ytdlp_config_overrides:
    format: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy,140-dashy/140-dashy-0/140"
    #format: "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"
    no-resize-buffer: true
    buffer-size: "4M"
    concurrent-fragments: 8

  ytdlp_raw_args: []

  # --- Live Error Parsing Rules ---
  # If a fatal error is detected, immediately ban the profile to stop the container.
  ban_on_fatal_error_in_batch: true
  fatal_error_patterns:
    - "HTTP Error 403"
    - "HTTP Error 502"

  tolerated_error_patterns:
    - "timed out"
    - "Timeout"
    - "connection reset by peer"
    - "Invalid data found when processing input"
    - "Error opening input files"

# Settings for controlling the behavior of dummy/simulation modes.
# These values can be overridden at runtime with the --set flag.
# NOTE(review): indentation was lost in this copy of the file, so the parent of
# this mapping cannot be determined here; it is kept at the column where it
# appears. Confirm against the consumer which section it belongs under.
dummy_simulation_settings:
  # Timings for dummy download simulation (per-format download time)
  download_min_seconds: 1.0
  download_max_seconds: 3.0
  download_failure_rate: 0.0
  download_skipped_failure_rate: 0.0

# Queue names and reporting behaviour for the download worker.
queue_policy:
  # Set to false to use legacy, unprefixed queue names (e.g., 'queue2_dl_inbox').
  # Set to true (or omit) to use environment-prefixed names
  # (e.g., 'sim_download_queue2_dl_inbox').
  use_env_prefix: false

  # Queue to pull download tasks from
  input_queue: "queue2_dl_inbox"

  # Whether to report completion back to a queue
  # Can be true (report all), false (report none), or "success_only"/"failure_only"
  report_completion: true

  # Queue to report completion to
  completion_queue: "queue2_dl_completed"

  # Queue to report failures to (always reported regardless of report_completion)
  failure_queue: "queue2_dl_fail"

  # Queue to report skipped tasks to
  skipped_queue: "queue2_dl_skipped"

  # How many tasks to process in a batch. For downloads, this should be 1,
  # as each worker locks a profile for a single download task.
  batch_size: 1

# Name of the download environment used by the simulation.
simulation_parameters:
  download_env: "sim_download"