# Source: yt-dlp-dags/ytops_client-source/policies/12_queue_auth_simulation.yaml
---
# Policy: Queue-based Authentication Simulation
#
# This policy simulates a continuous stream of info.json fetch requests. It pulls
# URLs from a Redis queue and processes them, acting as the first stage in a
# two-stage simulation. The second stage (downloading) can be handled in one
# of two ways, configured below:
#
# --- WORKFLOW 1: Queue-Auth -> File-Download ---
# - This policy creates info.json files in a shared directory (`save_info_json_dir`).
# - A separate download simulation (e.g., policy 11_direct_docker_download_simulation.yaml)
#   watches that directory, picks up the files, and performs the downloads.
# - To enable:
#   - Set `create_download_tasks: false`
#   - Ensure `save_info_json_dir` points to a shared path.
#
# --- WORKFLOW 2: Queue-Auth -> Queue-Download ---
# - This policy creates download *tasks* and pushes them to another Redis queue.
# - A separate download simulation (e.g., policy 13_queue_download_simulation.yaml)
#   pulls tasks from that queue and performs the downloads.
# - To enable:
#   - Set `create_download_tasks: true`
#   - Configure `download_task_queue` to the correct queue name.
#   - Use `download_task_granularity` to control if one task is created per-URL
#     or per-format.
#
name: 12_queue_auth_simulation
# Core mode switches for this policy run.
settings:
  mode: fetch_only
  orchestration_mode: queue_auth
  profile_mode: from_pool_with_lock
  # For Queue-Auth -> File-Download workflow: Directory to save generated info.json files.
  # A file-based download worker (e.g., policy 11) will watch this directory.
  # This directory MUST be inside the docker_host_mount_path.
  # NOTE: This path is expected to be on an s3fs mount for cross-host communication.
  save_info_json_dir: "run/docker_mount/info_json_tasks/direct_docker_simulation"
execution_control:
  # Automatically discover profile groups from Redis and create workers for them.
  # This avoids having to list each profile group (e.g., user31, user32) manually.
  worker_pool_discovery:
    # A glob-style pattern to find profile prefixes in Redis.
    # Kept quoted: an unquoted value starting with '*' would be parsed as a YAML alias.
    profile_prefix_pattern: "user*"
    # Number of workers to assign to each discovered profile prefix group.
    workers_per_profile_group: 1
  # How long a worker should pause if it cannot find an available profile to lock.
  worker_polling_interval_seconds: 1
  # No sleep between tasks; throughput is controlled by yt-dlp performance and profile availability.
info_json_generation_policy:
  # This setting tells the auth worker how many download tasks will be generated
  # per successful info.json. It is used to correctly increment the
  # 'pending_downloads' counter on the auth profile.
  # Can be an integer, or 'from_download_policy' to automatically count formats
  # from the 'download_policy.formats' setting in this same policy file.
  downloads_per_url: "from_download_policy"
  # (For Queue-Download workflow) Controls how download tasks are created.
  #
  # "per_format": (Default) Creates one download task for EACH format specified in
  #               'formats_to_download'. If `formats_to_download` is "140,299", two
  #               download tasks are created, and the 'pending_downloads' counter is
  #               incremented by 2.
  #
  # "per_url": Creates a SINGLE download task for the entire URL. The
  #            'formats_to_download' string is passed to the download worker as the
  #            format selector, but 'pending_downloads' is only incremented by 1 for
  #            the whole URL.
  #
  # --- Current Setting ---
  download_task_granularity: "per_format"
  # --- Alternative Setting (commented out) ---
  # download_task_granularity: "per_url"

  # profile_prefix is now defined per-pool in execution_control.worker_pools.
  # However, for queue auth mode, we need a fallback prefix.
  profile_prefix: "user"
# This section is needed for the 'downloads_per_url: from_download_policy' setting.
# It should mirror the formats being used by the download simulation.
download_policy:
  formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy,140-dashy/140-dashy-0/140"
direct_docker_cli_policy:
  # Which simulation environment's profiles to use for locking.
  use_profile_env: "auth"
  # If true, a worker will try to lock a different profile than the one it just used.
  avoid_immediate_profile_reuse: true
  # How long the worker should wait for a different profile before re-using the same one.
  avoid_reuse_max_wait_seconds: 5

  # NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest
  # sessions is ~300 videos/hour (~1000 webpage/player requests per hour).
  # For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour).
  # The enforcer policy (e.g., 8_unified_simulation_enforcer.yaml) should be
  # configured to respect these limits via rotation and rest periods.

  # If true, extract the visitor_id from yt-dlp logs, save it per-profile,
  # and inject it into subsequent requests for that profile.
  # track_visitor_id: true

  # --- Docker Execution Settings ---
  docker_image_name: "ytops/yt-dlp"  # Image to use for `docker run`
  docker_network_name: "airflow_proxynet"
  # IMPORTANT: This path on the HOST will be mounted into the container at `docker_container_mount_path`.
  docker_host_mount_path: "run/docker_mount"
  docker_container_mount_path: "/config"  # The mount point inside the container
  # Host path for persisting cache data (e.g., cookies, sigfuncs) between runs.
  # NOTE: This path should be on a fast, local disk, NOT on s3fs.
  docker_host_cache_path: ".cache/queue_auth_simulation"
  # Path inside the container where the cache is mounted. Should match HOME/.cache
  docker_container_cache_path: "/config/.cache"
  # If true, create and use a persistent cookie jar per profile inside the cache dir.
  # use_cookies: true

  # --- User-Agent Generation ---
  # Template for generating User-Agent strings for new profiles.
  # The '{major_version}' will be replaced by a version string.
  user_agent_template: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{major_version}.0.0.0 Safari/537.36"
  # Range of Chrome major versions to use for the template.
  # See CHROME_MAJOR_VERSION_RANGE in yt-dlp's random_user_agent():
  # https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/utils/networking.py
  user_agent_version_range: [137, 143]

  # A base config file can be used, with overrides applied from the policy.
  # The orchestrator will inject 'proxy', 'batch-file', and 'output' keys into the overrides.
  ytdlp_config_file: "cli.auth.config"
  ytdlp_config_overrides:
    skip-download: true
    write-info-json: true
    no-write-subs: true
    no-color: true
    ignore-errors: true
    use-extractors: ["youtube"]
  ytdlp_raw_args:
    - '--extractor-args "youtube:formats=duplicate;jsc_trace=true;player_client=tv_simply;pot_trace=true;skip=translated_subs,hls"'
    - '--extractor-args "youtubepot-bgutilhttp:base_url=http://172.17.0.1:4416"'
    - '--sleep-requests 0.75'
    # - '--retry-sleep linear=1::2'

  # --- Live Error Parsing Rules ---
  # These regex patterns are checked against yt-dlp's stderr in real-time.
  # If a fatal error is detected, immediately ban the profile to stop the container
  # and prevent further errors in the same batch.
  ban_on_fatal_error_in_batch: true
  fatal_error_patterns:
    # YouTube emits a typographic apostrophe (U+2019) in these messages, and
    # copy/paste tooling often mangles it. The '.' regex wildcard matches the
    # apostrophe regardless of which character variant appears.
    - "Sign in to confirm you.re not a bot"
    - "rate-limited by YouTube"
    - "This content isn.t available, try again later"
    - "HTTP Error 502"
  tolerated_error_patterns:
    - "HTTP Error 429"
    - "The uploader has not made this video available in your country"
    - "This video has been removed by the uploader"
    - "Private video"
    - "This is a private video"
    - "Video is private"
    - "Video unavailable"
    - "account associated with this video has been terminated"
    - "members-only content"
    - "Sign in to confirm your age"
  # Template for renaming the final info.json.
  rename_file_template: "{video_id}-{profile_name}-{proxy}.info.json"
# Settings for controlling the behavior of dummy/simulation modes.
# These values can be overridden at runtime with the --set flag.
dummy_simulation_settings:
  # Timings for dummy auth simulation (per-URL delay in a batch)
  auth_min_seconds: 0.1
  auth_max_seconds: 0.5
  auth_failure_rate: 0.0
  auth_skipped_failure_rate: 0.0
  # Timings for dummy download simulation (per-format download time)
  download_min_seconds: 1.0
  download_max_seconds: 3.0
  download_failure_rate: 0.0
  download_skipped_failure_rate: 0.0
queue_policy:
  # Set to false to use legacy, unprefixed queue names (e.g., 'queue2_auth_inbox').
  # Set to true (or omit) to use environment-prefixed names (e.g., 'sim_auth_queue2_auth_inbox').
  use_env_prefix: false
  # Queue to pull URLs from
  input_queue: "queue2_auth_inbox"

  # --- Download Handoff Configuration ---
  # Set to 'true' for Queue-Auth -> Queue-Download workflow.
  # Set to 'false' for Queue-Auth -> File-Download workflow.
  create_download_tasks: false
  # Queue to push download tasks to (if create_download_tasks is true)
  download_task_queue: "queue2_dl_inbox"
  # How many tasks a worker should pull from the queue at once.
  # This will become the batch size for the docker run.
  batch_size: 5
  # If specified, create download tasks for these formats.
  # Can be "all", a specific format ID, or a list of format IDs.
  # Defaults to the formats in download_policy.formats.
  # Example: formats_to_download: "140-dashy,299-dashy"
  # Example: formats_to_download: "all"
  # Example: formats_to_download: ["140-dashy", "299-dashy"]
  formats_to_download: "from_download_policy"
  # Whether to report completion back to a queue. Always reported for auth.
  report_completion: true
  # Queue to report completion to
  completion_queue: "queue2_auth_completed"
  # Queue to report failures to
  failure_queue: "queue2_auth_fail"
  # Queue to report skipped tasks to
  skipped_queue: "queue2_auth_skipped"
# Names of the simulation environments used for profile state in Redis.
simulation_parameters:
  auth_env: "sim_auth"
  download_env: "sim_download"