# Policy: Queue-based Authentication Simulation
#
# This policy simulates a continuous stream of info.json fetch requests. It pulls
# URLs from a Redis queue and processes them, acting as the first stage in a
# two-stage simulation. The second stage (downloading) can be handled in one
# of two ways, configured below:
#
# --- WORKFLOW 1: Queue-Auth -> File-Download ---
# - This policy creates info.json files in a shared directory (`save_info_json_dir`).
# - A separate download simulation (e.g., policy 11_direct_docker_download_simulation.yaml)
#   watches that directory, picks up the files, and performs the downloads.
# - To enable:
#   - Set `create_download_tasks: false`
#   - Ensure `save_info_json_dir` points to a shared path.
#
# --- WORKFLOW 2: Queue-Auth -> Queue-Download ---
# - This policy creates download *tasks* and pushes them to another Redis queue.
# - A separate download simulation (e.g., policy 13_queue_download_simulation.yaml)
#   pulls tasks from that queue and performs the downloads.
# - To enable:
#   - Set `create_download_tasks: true`
#   - Configure `download_task_queue` to the correct queue name.
#   - Use `download_task_granularity` to control if one task is created per-URL
#     or per-format.
---
name: 12_queue_auth_simulation

settings:
  mode: fetch_only
  orchestration_mode: queue_auth
  profile_mode: from_pool_with_lock

  # For Queue-Auth -> File-Download workflow: Directory to save generated info.json files.
  # A file-based download worker (e.g., policy 11) will watch this directory.
  # This directory MUST be inside the docker_host_mount_path.
  # NOTE: This path is expected to be on an s3fs mount for cross-host communication.
  save_info_json_dir: "run/docker_mount/info_json_tasks/direct_docker_simulation"

execution_control:
  # Automatically discover profile groups from Redis and create workers for them.
  # This avoids having to list each profile group (e.g., user31, user32) manually.
  worker_pool_discovery:
    # A glob-style pattern to find profile prefixes in Redis.
    profile_prefix_pattern: "user*"
    # Number of workers to assign to each discovered profile prefix group.
    workers_per_profile_group: 1
    # How long a worker should pause if it cannot find an available profile to lock.
    worker_polling_interval_seconds: 1
  # No sleep between tasks; throughput is controlled by yt-dlp performance and
  # profile availability.

# NOTE(review): section nesting below was reconstructed from a whitespace-mangled
# copy of this file; confirm top-level placement against the orchestrator's schema.
info_json_generation_policy:
  # This setting tells the auth worker how many download tasks will be generated
  # per successful info.json. It is used to correctly increment the
  # 'pending_downloads' counter on the auth profile.
  # Can be an integer, or 'from_download_policy' to automatically count formats
  # from the 'download_policy.formats' setting in this same policy file.
  downloads_per_url: "from_download_policy"

  # (For Queue-Download workflow) Controls how download tasks are created.
  #
  # "per_format": (Default) Creates one download task for EACH format specified
  #               in 'formats_to_download'. If `formats_to_download` is "140,299",
  #               two download tasks are created, and the 'pending_downloads'
  #               counter is incremented by 2.
  #
  # "per_url": Creates a SINGLE download task for the entire URL. The
  #            'formats_to_download' string is passed to the download worker as
  #            the format selector, but 'pending_downloads' is only incremented
  #            by 1 for the whole URL.
  #
  # --- Current Setting ---
  download_task_granularity: "per_format"
  # --- Alternative Setting (commented out) ---
  # download_task_granularity: "per_url"

# profile_prefix is now defined per-pool in execution_control.worker_pools
# However, for queue auth mode, we need a fallback prefix
profile_prefix: "user"

# This section is needed for the 'downloads_per_url: from_download_policy' setting.
# It should mirror the formats being used by the download simulation.
download_policy:
  formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy,140-dashy/140-dashy-0/140"

direct_docker_cli_policy:
  # Which simulation environment's profiles to use for locking.
  use_profile_env: "auth"
  # If true, a worker will try to lock a different profile than the one it just used.
  avoid_immediate_profile_reuse: true
  # How long the worker should wait for a different profile before re-using the same one.
  avoid_reuse_max_wait_seconds: 5

  # NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest
  # sessions is ~300 videos/hour (~1000 webpage/player requests per hour).
  # For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour).
  # The enforcer policy (e.g., 8_unified_simulation_enforcer.yaml) should be
  # configured to respect these limits via rotation and rest periods.

  # If true, extract the visitor_id from yt-dlp logs, save it per-profile,
  # and inject it into subsequent requests for that profile.
  # track_visitor_id: true

  # --- Docker Execution Settings ---
  docker_image_name: "ytops/yt-dlp"  # Image to use for `docker run`
  docker_network_name: "airflow_proxynet"
  # IMPORTANT: This path on the HOST will be mounted into the container at
  # `docker_container_mount_path`.
  docker_host_mount_path: "run/docker_mount"
  docker_container_mount_path: "/config"  # The mount point inside the container
  # Host path for persisting cache data (e.g., cookies, sigfuncs) between runs.
  # NOTE: This path should be on a fast, local disk, NOT on s3fs.
  docker_host_cache_path: ".cache/queue_auth_simulation"
  # Path inside the container where the cache is mounted. Should match HOME/.cache
  docker_container_cache_path: "/config/.cache"
  # If true, create and use a persistent cookie jar per profile inside the cache dir.
  # use_cookies: true

  # --- User-Agent Generation ---
  # Template for generating User-Agent strings for new profiles.
  # The '{major_version}' will be replaced by a version string.
  user_agent_template: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{major_version}.0.0.0 Safari/537.36"
  # Range of Chrome major versions to use for the template.
  # See CHROME_MAJOR_VERSION_RANGE in yt-dlp's random_user_agent():
  # https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/utils/networking.py
  user_agent_version_range: [137, 143]

  # A base config file can be used, with overrides applied from the policy.
  # The orchestrator will inject 'proxy', 'batch-file', and 'output' keys into the overrides.
  ytdlp_config_file: "cli.auth.config"
  ytdlp_config_overrides:
    skip-download: true
    write-info-json: true
    no-write-subs: true
    no-color: true
    ignore-errors: true
    use-extractors: ["youtube"]
  ytdlp_raw_args:
    - '--extractor-args "youtube:formats=duplicate;jsc_trace=true;player_client=tv_simply;pot_trace=true;skip=translated_subs,hls"'
    - '--extractor-args "youtubepot-bgutilhttp:base_url=http://172.17.0.1:4416"'
    - '--sleep-requests 0.75'  # --retry-sleep linear=1::2'

  # --- Live Error Parsing Rules ---
  # These regex patterns are checked against yt-dlp's stderr in real-time.
  # If a fatal error is detected, immediately ban the profile to stop the container
  # and prevent further errors in the same batch.
  ban_on_fatal_error_in_batch: true
  fatal_error_patterns:
    - "Sign in to confirm you’re not a bot"
    - "rate-limited by YouTube"
    - "This content isn't available, try again later"
    - "HTTP Error 502"
  tolerated_error_patterns:
    - "HTTP Error 429"
    - "The uploader has not made this video available in your country"
    - "This video has been removed by the uploader"
    - "Private video"
    - "This is a private video"
    - "Video is private"
    - "Video unavailable"
    - "account associated with this video has been terminated"
    - "members-only content"
    - "Sign in to confirm your age"

  # Template for renaming the final info.json.
  rename_file_template: "{video_id}-{profile_name}-{proxy}.info.json"

# Settings for controlling the behavior of dummy/simulation modes.
# These values can be overridden at runtime with the --set flag.
dummy_simulation_settings:
  # Timings for dummy auth simulation (per-URL delay in a batch)
  auth_min_seconds: 0.1
  auth_max_seconds: 0.5
  auth_failure_rate: 0.0
  auth_skipped_failure_rate: 0.0
  # Timings for dummy download simulation (per-format download time)
  download_min_seconds: 1.0
  download_max_seconds: 3.0
  download_failure_rate: 0.0
  download_skipped_failure_rate: 0.0

queue_policy:
  # Set to false to use legacy, unprefixed queue names (e.g., 'queue2_auth_inbox').
  # Set to true (or omit) to use environment-prefixed names (e.g., 'sim_auth_queue2_auth_inbox').
  use_env_prefix: false
  # Queue to pull URLs from
  input_queue: "queue2_auth_inbox"

  # --- Download Handoff Configuration ---
  # Set to 'true' for Queue-Auth -> Queue-Download workflow.
  # Set to 'false' for Queue-Auth -> File-Download workflow.
  create_download_tasks: false
  # Queue to push download tasks to (if create_download_tasks is true)
  download_task_queue: "queue2_dl_inbox"

  # How many tasks a worker should pull from the queue at once.
  # This will become the batch size for the docker run.
  batch_size: 5

  # If specified, create download tasks for these formats
  # Can be "all", a specific format ID, or a list of format IDs
  # Defaults to the formats in download_policy.formats
  # Example: formats_to_download: "140-dashy,299-dashy"
  # Example: formats_to_download: "all"
  # Example: formats_to_download: ["140-dashy", "299-dashy"]
  formats_to_download: "from_download_policy"

  # Whether to report completion back to a queue. Always reported for auth.
  report_completion: true
  # Queue to report completion to
  completion_queue: "queue2_auth_completed"
  # Queue to report failures to
  failure_queue: "queue2_auth_fail"
  # Queue to report skipped tasks to
  skipped_queue: "queue2_auth_skipped"

simulation_parameters:
  auth_env: "sim_auth"
  download_env: "sim_download"