# Policy: Queue-based Authentication Simulation
#
# This policy simulates a continuous stream of info.json fetch requests. It pulls
# URLs from a Redis queue and processes them, acting as the first stage in a
# two-stage simulation. The second stage (downloading) can be handled in one
# of two ways, configured below:
#
# --- WORKFLOW 1: Queue-Auth -> File-Download ---
# - This policy creates info.json files in a shared directory (`save_info_json_dir`).
# - A separate download simulation (e.g., policy 11_direct_docker_download_simulation.yaml)
#   watches that directory, picks up the files, and performs the downloads.
# - To enable:
#   - Set `create_download_tasks: false`
#   - Ensure `save_info_json_dir` points to a shared path.
#
# --- WORKFLOW 2: Queue-Auth -> Queue-Download ---
# - This policy creates download *tasks* and pushes them to another Redis queue.
# - A separate download simulation (e.g., policy 13_queue_download_simulation.yaml)
#   pulls tasks from that queue and performs the downloads.
# - To enable:
#   - Set `create_download_tasks: true`
#   - Configure `download_task_queue` to the correct queue name.
#   - Use `download_task_granularity` to control whether one task is created
#     per-URL or per-format.
#
name: 12_queue_auth_simulation

# General run settings for this policy.
settings:
  mode: fetch_only
  orchestration_mode: queue_auth
  profile_mode: from_pool_with_lock
  # For Queue-Auth -> File-Download workflow: Directory to save generated info.json files.
  # A file-based download worker (e.g., policy 11) will watch this directory.
  # This directory MUST be inside the docker_host_mount_path.
  # NOTE: This path is expected to be on an s3fs mount for cross-host communication.
  save_info_json_dir: "run/docker_mount/info_json_tasks/direct_docker_simulation"

execution_control:
  # Automatically discover profile groups from Redis and create workers for them.
  # This avoids having to list each profile group (e.g., user31, user32) manually.
  worker_pool_discovery:
    # A glob-style pattern to find profile prefixes in Redis.
    profile_prefix_pattern: "user*"
    # Number of workers to assign to each discovered profile prefix group.
    workers_per_profile_group: 1
  # How long a worker should pause if it cannot find an available profile to lock.
  # NOTE(review): placed here as a sibling of `worker_pool_discovery`; confirm the
  # consumer does not expect this key inside that mapping instead.
  worker_polling_interval_seconds: 1
  # No sleep between tasks; throughput is controlled by yt-dlp performance and profile availability.

info_json_generation_policy:
  # This setting tells the auth worker how many download tasks will be generated
  # per successful info.json. It is used to correctly increment the
  # 'pending_downloads' counter on the auth profile.
  # Can be an integer, or 'from_download_policy' to automatically count formats
  # from the 'download_policy.formats' setting in this same policy file.
  downloads_per_url: "from_download_policy"
  # (For Queue-Download workflow) Controls how download tasks are created.
  #
  # "per_format": (Default) Creates one download task for EACH format specified in 'formats_to_download'.
  #   If `formats_to_download` is "140,299", two download tasks are created, and
  #   the 'pending_downloads' counter is incremented by 2.
  #
  # "per_url": Creates a SINGLE download task for the entire URL. The 'formats_to_download'
  #   string is passed to the download worker as the format selector, but 'pending_downloads'
  #   is only incremented by 1 for the whole URL.
  #
  # --- Current Setting ---
  download_task_granularity: "per_format"
  # --- Alternative Setting (commented out) ---
  # download_task_granularity: "per_url"
  # profile_prefix is now defined per-pool in execution_control.worker_pools.
  # However, for queue auth mode, we need a fallback prefix.
  # NOTE(review): this file configures `execution_control.worker_pool_discovery`,
  # not `worker_pools`; confirm which key the comment above should name.
  profile_prefix: "user"

# This section is needed for the 'downloads_per_url: from_download_policy' setting.
# It should mirror the formats being used by the download simulation.
download_policy:
  formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy,140-dashy/140-dashy-0/140"

direct_docker_cli_policy:
  # Which simulation environment's profiles to use for locking.
  use_profile_env: "auth"

  # If true, a worker will try to lock a different profile than the one it just used.
  avoid_immediate_profile_reuse: true
  # How long the worker should wait for a different profile before re-using the same one.
  avoid_reuse_max_wait_seconds: 5

  # NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest
  # sessions is ~300 videos/hour (~1000 webpage/player requests per hour).
  # For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour).
  # The enforcer policy (e.g., 8_unified_simulation_enforcer.yaml) should be
  # configured to respect these limits via rotation and rest periods.

  # If true, extract the visitor_id from yt-dlp logs, save it per-profile,
  # and inject it into subsequent requests for that profile.
  # track_visitor_id: true

  # --- Docker Execution Settings ---
  docker_image_name: "ytops/yt-dlp"  # Image to use for `docker run`
  docker_network_name: "airflow_proxynet"
  # IMPORTANT: This path on the HOST will be mounted into the container at `docker_container_mount_path`.
  docker_host_mount_path: "run/docker_mount"
  docker_container_mount_path: "/config"  # The mount point inside the container

  # Host path for persisting cache data (e.g., cookies, sigfuncs) between runs.
  # NOTE: This path should be on a fast, local disk, NOT on s3fs.
  docker_host_cache_path: ".cache/queue_auth_simulation"
  # Path inside the container where the cache is mounted. Should match HOME/.cache
  docker_container_cache_path: "/config/.cache"

  # If true, create and use a persistent cookie jar per profile inside the cache dir.
  # use_cookies: true

  # --- User-Agent Generation ---
  # Template for generating User-Agent strings for new profiles.
  # The '{major_version}' will be replaced by a version string.
  user_agent_template: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{major_version}.0.0.0 Safari/537.36"
  # Range of Chrome major versions to use for the template.
  # See CHROME_MAJOR_VERSION_RANGE in yt-dlp's random_user_agent():
  # https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/utils/networking.py
  user_agent_version_range: [137, 143]

  # A base config file can be used, with overrides applied from the policy.
  # The orchestrator will inject 'proxy', 'batch-file', and 'output' keys into the overrides.
  ytdlp_config_file: "cli.auth.config"
  ytdlp_config_overrides:
    skip-download: true
    write-info-json: true
    no-write-subs: true
    no-color: true
    ignore-errors: true
    use-extractors: ["youtube"]

  ytdlp_raw_args:
    - '--extractor-args "youtube:formats=duplicate;jsc_trace=true;player_client=tv_simply;pot_trace=true;skip=translated_subs,hls"'
    - '--extractor-args "youtubepot-bgutilhttp:base_url=http://172.17.0.1:4416"'
    - '--sleep-requests 0.75'
    # - '--retry-sleep linear=1::2'

  # --- Live Error Parsing Rules ---
  # These regex patterns are checked against yt-dlp's stderr in real-time.
  # If a fatal error is detected, immediately ban the profile to stop the container
  # and prevent further errors in the same batch.
  ban_on_fatal_error_in_batch: true
  # NOTE(review): the first pattern contains a typographic apostrophe (U+2019) while
  # the third uses a plain ASCII apostrophe. Confirm each matches the exact text
  # that yt-dlp prints before normalizing either one.
  fatal_error_patterns:
    - "Sign in to confirm you’re not a bot"
    - "rate-limited by YouTube"
    - "This content isn't available, try again later"
    - "HTTP Error 502"

  tolerated_error_patterns:
    - "HTTP Error 429"
    - "The uploader has not made this video available in your country"
    - "This video has been removed by the uploader"
    - "Private video"
    - "This is a private video"
    - "Video is private"
    - "Video unavailable"
    - "account associated with this video has been terminated"
    - "members-only content"
    - "Sign in to confirm your age"

  # Template for renaming the final info.json.
  rename_file_template: "{video_id}-{profile_name}-{proxy}.info.json"

  # Settings for controlling the behavior of dummy/simulation modes.
  # These values can be overridden at runtime with the --set flag.
  # NOTE(review): kept nested under `direct_docker_cli_policy` because it follows
  # that section's keys directly; confirm the consumer does not expect
  # `dummy_simulation_settings` at the top level or under `settings`.
  dummy_simulation_settings:
    # Timings for dummy auth simulation (per-URL delay in a batch)
    auth_min_seconds: 0.1
    auth_max_seconds: 0.5
    auth_failure_rate: 0.0
    auth_skipped_failure_rate: 0.0
    # Timings for dummy download simulation (per-format download time)
    download_min_seconds: 1.0
    download_max_seconds: 3.0
    download_failure_rate: 0.0
    download_skipped_failure_rate: 0.0

queue_policy:
  # Set to false to use legacy, unprefixed queue names (e.g., 'queue2_auth_inbox').
  # Set to true (or omit) to use environment-prefixed names (e.g., 'sim_auth_queue2_auth_inbox').
  use_env_prefix: false

  # Queue to pull URLs from
  input_queue: "queue2_auth_inbox"

  # --- Download Handoff Configuration ---
  # Set to 'true' for Queue-Auth -> Queue-Download workflow.
  # Set to 'false' for Queue-Auth -> File-Download workflow.
  create_download_tasks: false

  # Queue to push download tasks to (if create_download_tasks is true)
  download_task_queue: "queue2_dl_inbox"

  # How many tasks a worker should pull from the queue at once.
  # This will become the batch size for the docker run.
  batch_size: 5

  # If specified, create download tasks for these formats
  # Can be "all", a specific format ID, or a list of format IDs
  # Defaults to the formats in download_policy.formats
  # Example: formats_to_download: "140-dashy,299-dashy"
  # Example: formats_to_download: "all"
  # Example: formats_to_download: ["140-dashy", "299-dashy"]
  # NOTE(review): the value below is not one of the forms listed above; presumably
  # it selects download_policy.formats explicitly — confirm against the consumer.
  formats_to_download: "from_download_policy"

# Environment names used by the two simulation stages (auth and download).
simulation_parameters:
  auth_env: "sim_auth"
  download_env: "sim_download"