# yt-dlp-dags/ytops_client-source/policies/10_direct_docker_auth_simulation.yaml

# Policy: Continuous Authentication Simulation via Direct Docker Exec
#
# This policy simulates a continuous stream of info.json fetch requests using
# the 'direct_docker_cli' orchestration mode. It calls a yt-dlp command inside
# a running Docker container, passing in a batch file and configuration.
#
# Profiles come from a managed pool; one profile is locked for each BATCH of
# requests (not for each individual URL). The host orchestrator prepares the
# files, and docker exec runs yt-dlp. The container itself does not need to be
# Redis-aware.
#
# Name of this policy.
name: direct_docker_auth_simulation

settings:
  mode: fetch_only
  orchestration_mode: direct_docker_cli
  profile_mode: from_pool_with_lock
  urls_file: 'inputfiles/urls.rt300.txt'
  # Keep this directory under docker_host_mount_path; otherwise the download
  # simulation cannot find the generated files.
  # NOTE: The path is expected to live on an s3fs mount so that it can be
  # shared across hosts.
  save_info_json_dir: 'run/docker_mount/info_json_tasks/direct_docker_simulation'

  # Knobs for the dummy/simulation modes. Any of these values can be
  # overridden at runtime with the --set flag.
  dummy_simulation_settings:
    # Dummy auth simulation: per-URL delay range within a batch.
    auth_min_seconds: 0.1
    auth_max_seconds: 0.5
    auth_failure_rate: 0.0
    auth_skipped_failure_rate: 0.0
    # Dummy download simulation: per-format download time range.
    download_min_seconds: 1.0
    download_max_seconds: 3.0
    download_failure_rate: 0.0
    download_skipped_failure_rate: 0.0

execution_control:
  # Discover profile groups from Redis automatically and create workers for
  # them, so each group (e.g., user31, user32) need not be listed by hand.
  worker_pool_discovery:
    # Glob-style pattern used to find profile prefixes in Redis.
    # 'user*' matches profiles such as 'user31_001' and 'user61_002'; the tool
    # then creates worker pools grouped by prefix ('user31', 'user61', ...).
    profile_prefix_pattern: 'user*'
    # Worker count assigned to each discovered profile prefix group.
    workers_per_profile_group: 1
  # Pause, in seconds, before a worker retries when it finds no profile to lock.
  worker_polling_interval_seconds: 1
  # No sleep between tasks; throughput is governed by yt-dlp performance and
  # profile availability.

info_json_generation_policy:
  # Number of download tasks generated per successful info.json. The auth
  # worker uses it to increment the 'pending_downloads' counter on the auth
  # profile correctly.
  # Accepts an integer, or 'from_download_policy' to count the formats listed
  # in 'download_policy.formats' of this same policy file.
  downloads_per_url: 'from_download_policy'
  # profile_prefix is no longer set here; it is defined per pool under
  # execution_control.
  # NOTE(review): the original note referred to execution_control.worker_pools,
  # but this file only defines execution_control.worker_pool_discovery - confirm.

# Required because 'downloads_per_url' is set to 'from_download_policy'.
# Keep the formats in sync with those used by the download simulation.
download_policy:
  # NOTE(review): the tail of a longer selector was left commented out after
  # this value; the full string appears to have been:
  #   "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy,140-dashy/140-dashy-0/140"
  formats: '299-dashy'

direct_docker_cli_policy:
  # Which simulation environment's profiles to use for locking.
  use_profile_env: "auth"

  # If true, a worker will try to lock a different profile than the one it just used.
  avoid_immediate_profile_reuse: true
  # How long (in seconds) the worker should wait for a different profile before
  # re-using the same one.
  avoid_reuse_max_wait_seconds: 5

  # NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest
  # sessions is ~300 videos/hour (~1000 webpage/player requests per hour).
  # For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour).
  # The enforcer policy (e.g., 8_unified_simulation_enforcer.yaml) should be
  # configured to respect these limits via rotation and rest periods.

  # If true, extract the visitor_id from yt-dlp logs, save it per-profile,
  # and inject it into subsequent requests for that profile.
  # (Currently disabled; uncomment to enable.)
  # track_visitor_id: true

  # --- Docker Execution Settings ---
  # NOTE(review): the file header describes `docker exec` into a running
  # container, while the comment below mentions `docker run` - confirm which
  # applies.
  docker_image_name: "ytops/yt-dlp" # Image to use for `docker run`
  docker_network_name: "airflow_proxynet"
  # IMPORTANT: This path on the HOST will be mounted into the container at `docker_container_mount_path`.
  docker_host_mount_path: "run/docker_mount"
  docker_container_mount_path: "/config" # The mount point inside the container

  # Host path for persisting cache data (e.g., cookies, sigfuncs) between runs.
  # NOTE: This path should be on a fast, local disk, NOT on s3fs.
  docker_host_cache_path: ".cache/direct_docker_simulation"
  # Path inside the container where the cache is mounted. Should match HOME/.cache
  docker_container_cache_path: "/config/.cache"

  # If true, create and use a persistent cookie jar per profile inside the cache dir.
  # (Currently disabled; uncomment to enable.)
  # use_cookies: true

  # --- User-Agent Generation ---
  # Template for generating User-Agent strings for new profiles.
  # The '{major_version}' will be replaced by a version string.
  user_agent_template: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{major_version}.0.0.0 Safari/537.36"
  # Range of Chrome major versions to use for the template.
  # See CHROME_MAJOR_VERSION_RANGE in yt-dlp's random_user_agent():
  # https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/utils/networking.py
  user_agent_version_range: [137, 143]

  # Presumably the number of URLs placed in each batch file; a profile is
  # locked once per batch (see file header) - TODO confirm.
  batch_size: 5

  # A base config file can be used, with overrides applied from the policy.
  # The orchestrator will inject 'proxy', 'batch-file', and 'output' keys into the overrides.
  ytdlp_config_file: "cli.auth.config"
  ytdlp_config_overrides:
    skip-download: true
    write-info-json: true
    no-write-subs: true
    no-color: true
    ignore-errors: true
    use-extractors: ["youtube"]

  # Extra yt-dlp command-line arguments, given as raw strings.
  ytdlp_raw_args:
    - '--extractor-args "youtube:formats=duplicate;jsc_trace=true;player_client=tv_simply;pot_trace=true;skip=translated_subs,hls"'
    - '--extractor-args "youtubepot-bgutilhttp:base_url=http://172.17.0.1:4416"'
    - '--sleep-requests 0.75'
    # Disabled argument, kept for reference:
    # - '--retry-sleep linear=1::2'

  # --- Live Error Parsing Rules ---
  # These regex patterns are checked against yt-dlp's stderr in real-time.
  # If a fatal error is detected, immediately ban the profile to stop the container
  # and prevent further errors in the same batch.
  ban_on_fatal_error_in_batch: true
  fatal_error_patterns:
    - "Sign in to confirm you’re not a bot"
    - "rate-limited by YouTube"
    - "This content isn't available, try again later"
    - "HTTP Error 502"

  # NOTE(review): presumably errors matching these patterns are tolerated,
  # i.e. they do not cause the profile to be banned - confirm against the
  # orchestrator.
  tolerated_error_patterns:
    - "HTTP Error 429"
    - "The uploader has not made this video available in your country"
    - "This video has been removed by the uploader"
    - "Private video"
    - "This is a private video"
    - "Video is private"
    - "Video unavailable"
    - "account associated with this video has been terminated"
    - "members-only content"
    - "Sign in to confirm your age"

  # Template for renaming the final info.json.
  rename_file_template: "{video_id}-{profile_name}-{proxy}.info.json"

# Environment names for the simulation.
# NOTE(review): presumably 'use_profile_env: "auth"' in direct_docker_cli_policy
# selects auth_env here - confirm against the orchestrator.
simulation_parameters:
  auth_env: 'sim_auth'
  download_env: 'sim_download'