yt-dlp-dags/policies/2_download_only_policies.yaml

# This file contains policies for testing only the download step from
# existing info.json files. No new info.json files are generated.

---
# Policy: Basic profile-aware download test.
# This policy reads info.json files from a directory, groups them by a profile
# name extracted from the filename, and downloads them using multiple workers.
# Each worker handles one or more profiles sequentially.
name: basic_profile_aware_download

settings:
  mode: download_only
  info_json_dir: "prefetched_info_jsons"
  # Regex to extract profile names from filenames like '...-VIDEOID-my_profile_name.json'.
  profile_extraction_regex: ".*-[a-zA-Z0-9_-]{11}-(.+)\\.json"

execution_control:
  run_until: { cycles: 1 }
  # 'auto' sets workers to the number of profiles, capped by auto_workers_max.
  workers: auto
  auto_workers_max: 8
  # This sleep applies between each file downloaded by a single profile.
  sleep_between_tasks: { min_seconds: 1, max_seconds: 2 }

download_policy:
  formats: "18,140,299/298/137/136/135/134/133"
  downloader: "aria2c"
  downloader_args: "aria2c:-x 4 -k 1M"
  extra_args: "--cleanup --output-dir /tmp/downloads"
  # This sleep applies between formats of a single video.
  sleep_between_formats: { min_seconds: 0, max_seconds: 0 }

---
# Policy: Continuous download from a folder (Pipeline Stage 2).
# This policy watches a directory for new info.json files and processes them
# as they appear. It is designed to work as the second stage of a pipeline,
# consuming files generated by a 'fetch_only' policy like 'tv_fetch_with_cooldown'.
name: continuous_watch_download

settings:
  mode: download_only
  info_json_dir: "live_info_jsons"
  directory_scan_mode: continuous
  mark_processed_files: true # Rename files to *.processed to avoid re-downloading.
  max_files_per_cycle: 50 # Process up to 50 new files each time it checks.
  sleep_if_no_new_files_seconds: 15

execution_control:
  # Note: For 'continuous' mode, a time-based run_until (e.g., {minutes: 120})
  # is more typical. {cycles: 1} will cause it to scan the directory once
  # for new files, process them, and then exit.
  run_until: { cycles: 1 }
  workers: 4 # Use a few workers to process files in parallel.
  sleep_between_tasks: { min_seconds: 0, max_seconds: 0 }

download_policy:
  formats: "18,140"
  extra_args: "--cleanup --output-dir /tmp/downloads"