Add cleanup, ffmpeg, changes on prefixes per worker, load balance on enforcer, dummy updates mode

This commit is contained in:
aperez 2025-12-30 09:47:03 +03:00
parent 5479e8c8f8
commit efac6cf1fb
12 changed files with 1346 additions and 1717 deletions

View File

@ -1 +0,0 @@
PASS_TO_UNLOCK_host_vars_encrypted

View File

@ -1,44 +0,0 @@
Deploy with Ansible from af-jump
```
ssh user@af-jump
cp cluster.dummy.yml cluster.stress.yml
vi cluster.stress.yml
./tools/generate-inventory.py cluster.stress.yml
ansible-playbook ansible/playbook-XXX -i ansible/inventory.stress.ini
playbook-base-system.yml
playbook-proxies.yml
playbook-stress-sync-code.yml
playbook-stress-install-deps.yml
playbook-stress-generate-env.yml
playbook-docker-services-setup.yml
```
Code-only updates of ytops
```
#git pull or ./tools/sync-to-jump.sh
playbook-stress-sync-code.yml
```
Running
```
ssh user@af-green
cd /srv/airflow_master
./bin/build-yt-dlp-image
bin/ytops-client setup-profiles --policy policies/6_profile_setup_policy.yaml --cleanup-all
bin/ytops-client profile list --auth-env sim_auth --download-env sim_download --live --no-blink --show-proxy-activity
bin/ytops-client policy-enforcer --policy policies/8_unified_simulation_enforcer.yaml --live
bin/ytops-client stress-policy --policy policies/10_direct_docker_auth_simulation.yaml --verbose --set execution_control.workers=1 --set settings.urls_file=inputfiles/urls.rt3700.txt
bin/ytops-client stress-policy --policy policies/11_direct_docker_download_simulation.yaml --set execution_control.workers=1 --verbose
```

View File

@ -20,13 +20,30 @@ settings:
save_info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation" save_info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation"
execution_control: execution_control:
workers: 1 # Define worker pools, each tied to a specific profile prefix.
# The stress tool will launch the specified number of workers for each pool.
worker_pools:
- profile_prefix: "user1"
workers: 1
- profile_prefix: "user2"
workers: 1
# How long a worker should pause if it cannot find an available profile to lock. # How long a worker should pause if it cannot find an available profile to lock.
worker_polling_interval_seconds: 1 worker_polling_interval_seconds: 1
# No sleep between tasks; throughput is controlled by yt-dlp performance and profile availability. # No sleep between tasks; throughput is controlled by yt-dlp performance and profile availability.
info_json_generation_policy: info_json_generation_policy:
profile_prefix: "user1" # This setting tells the auth worker how many download tasks will be generated
# per successful info.json. It is used to correctly increment the
# 'pending_downloads' counter on the auth profile.
# Can be an integer, or 'from_download_policy' to automatically count formats
# from the 'download_policy.formats' setting in this same policy file.
downloads_per_url: "from_download_policy"
# profile_prefix is now defined per-pool in execution_control.worker_pools
# This section is needed for the 'downloads_per_url: from_download_policy' setting.
# It should mirror the formats being used by the download simulation.
download_policy:
formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy,140-dashy/140-dashy-0/140"
direct_docker_cli_policy: direct_docker_cli_policy:
# Which simulation environment's profiles to use for locking. # Which simulation environment's profiles to use for locking.

View File

@ -14,23 +14,34 @@ settings:
# This directory should contain info.json files generated by an auth simulation, # This directory should contain info.json files generated by an auth simulation,
# like `10_direct_docker_auth_simulation`. # like `10_direct_docker_auth_simulation`.
# It MUST be inside the docker_host_mount_path. # It MUST be inside the docker_host_mount_path.
info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation" info_json_dir: "run/docker_mount/download_tasks"
#info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation"
# Regex to extract the profile name from a task filename. The first capture # Regex to extract the profile name from a task filename. The first capture
# group is used. This is crucial for the task-first locking strategy. # group is used. This is crucial for the task-first locking strategy.
# It looks for a component that starts with 'user' between two hyphens. # It looks for a component that starts with 'user' between two hyphens.
profile_extraction_regex: '^.+?-(user[^-]+)-' profile_extraction_regex: '^.+?-(user[^-]+)-'
execution_control: execution_control:
workers: 1 # Define worker pools, each tied to a specific profile prefix.
# The stress tool will launch the specified number of workers for each pool.
worker_pools:
- profile_prefix: "user1"
workers: 1
- profile_prefix: "user2"
workers: 1
# How long a worker should pause if it cannot find an available profile or task. # How long a worker should pause if it cannot find an available profile or task.
worker_polling_interval_seconds: 1 worker_polling_interval_seconds: 1
download_policy: download_policy:
profile_prefix: "user1" # profile_prefix is now defined per-pool in execution_control.worker_pools
# A comma-separated list of format IDs to download for each info.json. # A comma-separated list of format IDs to download for each info.json.
# This is used by the dummy mode simulation to test per-format downloads. # This is used by the dummy mode simulation to test per-format downloads.
# In non-dummy mode, the format selector in ytdlp_config_overrides is used. # In non-dummy mode, the format selector in ytdlp_config_overrides is used.
formats: "140-dashy,299-dashy" formats: "140-dashy,299-dashy"
# After a successful download, run ffprobe to generate a stream info JSON file.
run_ffprobe: true
# After a successful download, replace the media file with a zero-byte .empty file.
cleanup: true
# Default cooldown in seconds if not specified by the enforcer in Redis. # Default cooldown in seconds if not specified by the enforcer in Redis.
# The value from Redis (set via `unlock_cooldown_seconds` in the enforcer policy) # The value from Redis (set via `unlock_cooldown_seconds` in the enforcer policy)
# will always take precedence. This is a fallback. # will always take precedence. This is a fallback.

View File

@ -15,6 +15,9 @@ auth_profile_setup:
- prefix: "user1" - prefix: "user1"
proxy: "sslocal-rust-1092:1092" proxy: "sslocal-rust-1092:1092"
count: 4 count: 4
- prefix: "user2"
proxy: "sslocal-rust-1093:1093"
count: 4
# --- Profile setup for the DOWNLOAD simulation --- # --- Profile setup for the DOWNLOAD simulation ---
download_profile_setup: download_profile_setup:
@ -24,4 +27,6 @@ download_profile_setup:
- prefix: "user1" - prefix: "user1"
proxy: "sslocal-rust-1092:1092" proxy: "sslocal-rust-1092:1092"
count: 4 count: 4
- prefix: "user2"
proxy: "sslocal-rust-1093:1093"
count: 4

View File

@ -19,12 +19,16 @@ simulation_parameters:
# --- Policies for the Authentication Simulation --- # --- Policies for the Authentication Simulation ---
auth_policy_enforcer_config: auth_policy_enforcer_config:
# New setting for load balancing across profile groups.
# "round_robin": Cycle through available groups evenly (FIFO based on rest time).
# "least_loaded": Prioritize the group with the fewest pending downloads.
profile_selection_strategy: "least_loaded"
# Ban if 2 failures occur within a 1-minute window. # Ban if 2 failures occur within a 1-minute window.
#ban_on_failures: 2 #ban_on_failures: 2
#ban_on_failures_window_minutes: 1 #ban_on_failures_window_minutes: 1
# The standard rest policy is disabled, as rotation is handled by the profile group. # The standard rest policy is disabled, as rotation is handled by the profile group.
profile_prefix: "user1"
# New rate limit policy to enforce requests-per-hour limits. # New rate limit policy to enforce requests-per-hour limits.
# For guest sessions, the limit is ~300 videos/hour. # For guest sessions, the limit is ~300 videos/hour.
@ -44,7 +48,7 @@ auth_policy_enforcer_config:
# The enforcer will ensure that no more than `max_active_profiles` from this # The enforcer will ensure that no more than `max_active_profiles` from this
# group are in the ACTIVE state at any time. # group are in the ACTIVE state at any time.
profile_groups: profile_groups:
- name: "exclusive_auth_profiles" - name: "auth_user1"
prefix: "user1" prefix: "user1"
# Enforce that only 1 profile from this group can be active at a time. # Enforce that only 1 profile from this group can be active at a time.
max_active_profiles: 1 max_active_profiles: 1
@ -65,6 +69,14 @@ auth_policy_enforcer_config:
# Safety net: max time to wait for downloads before forcing rotation. # Safety net: max time to wait for downloads before forcing rotation.
# Should be aligned with info.json URL validity (e.g., 4 hours = 240 mins). # Should be aligned with info.json URL validity (e.g., 4 hours = 240 mins).
max_wait_for_downloads_minutes: 240 max_wait_for_downloads_minutes: 240
- name: "auth_user2"
prefix: "user2"
max_active_profiles: 1
rotate_after_requests: 25
rest_duration_minutes_on_rotation: 1
defer_activation_if_any_waiting: true
wait_download_finish_per_profile: true
max_wait_for_downloads_minutes: 240
# Time-based proxy rules are disabled as they are not needed for this setup. # Time-based proxy rules are disabled as they are not needed for this setup.
proxy_work_minutes: 0 proxy_work_minutes: 0
@ -89,26 +101,28 @@ auth_policy_enforcer_config:
unlock_cooldown_seconds: 1 unlock_cooldown_seconds: 1
# Cross-simulation synchronization # Cross-simulation synchronization
cross_simulation_sync: #cross_simulation_sync:
# Link auth profiles to download profiles (by name) # Link auth profiles to download profiles (by name)
# Both profiles should exist in their respective environments # Both profiles should exist in their respective environments
profile_links: #profile_links:
- auth: "user1" # - auth: "user1"
download: "user1" # download: "user1"
# - auth: "user2"
# download: "user2"
# Which states to synchronize # Which states to synchronize
#sync_states: #sync_states:
# - "RESTING" # Disabling to prevent deadlock when auth profile is waiting for downloads. # - "RESTING" # Disabling to prevent deadlock when auth profile is waiting for downloads.
# The download profile must remain active to process them. # The download profile must remain active to process them.
# - "BANNED" # - "BANNED"
# Whether to sync rotation (when auth is rotated due to rotate_after_requests) # Whether to sync rotation (when auth is rotated due to rotate_after_requests)
#sync_rotation: true #sync_rotation: true
# Whether download profile should be banned if auth is banned (even if download hasn't violated its own rules) # Whether download profile should be banned if auth is banned (even if download hasn't violated its own rules)
#enforce_auth_lead: true #enforce_auth_lead: true
# Ensures the same profile (e.g., user1_0) is active in both simulations. # Ensures the same profile (e.g., user1_0) is active in both simulations.
# This will activate the correct download profile and rest any others in its group. # This will activate the correct download profile and rest any others in its group.
sync_active_profile: true #sync_active_profile: true
# When an auth profile is waiting for downloads, ensure the matching download profile is active # When an auth profile is waiting for downloads, ensure the matching download profile is active
sync_waiting_downloads: true #sync_waiting_downloads: true
# --- Policies for the Download Simulation --- # --- Policies for the Download Simulation ---
download_policy_enforcer_config: download_policy_enforcer_config:
@ -117,7 +131,6 @@ download_policy_enforcer_config:
ban_on_failures_window_minutes: 1 ban_on_failures_window_minutes: 1
# Standard rest policy is disabled in favor of group rotation. # Standard rest policy is disabled in favor of group rotation.
profile_prefix: "user1"
# New rate limit policy to enforce requests-per-hour limits. # New rate limit policy to enforce requests-per-hour limits.
# For guest sessions, the limit is ~300 videos/hour. We set it slightly lower to be safe. # For guest sessions, the limit is ~300 videos/hour. We set it slightly lower to be safe.
@ -135,11 +148,16 @@ download_policy_enforcer_config:
# A group of profiles that are mutually exclusive. Only one will be active at a time. # A group of profiles that are mutually exclusive. Only one will be active at a time.
profile_groups: profile_groups:
- name: "exclusive_download_profiles" - name: "download_user1"
prefix: "user1" prefix: "user1"
rotate_after_requests: 25 rotate_after_requests: 25
rest_duration_minutes_on_rotation: 1 rest_duration_minutes_on_rotation: 1
max_active_profiles: 1 max_active_profiles: 4
- name: "download_user2"
prefix: "user2"
rotate_after_requests: 25
rest_duration_minutes_on_rotation: 1
max_active_profiles: 4
# Time-based proxy rules are disabled. # Time-based proxy rules are disabled.
proxy_work_minutes: 50 proxy_work_minutes: 50

View File

@ -3,6 +3,7 @@
CLI tool to enforce policies on profiles. CLI tool to enforce policies on profiles.
""" """
import argparse import argparse
import collections
import json import json
import logging import logging
import os import os
@ -67,7 +68,7 @@ class PolicyEnforcer:
self.enforce_profile_group_policies(getattr(args, 'profile_groups', []), all_profiles_map) self.enforce_profile_group_policies(getattr(args, 'profile_groups', []), all_profiles_map)
# Un-rest profiles. This also reads from and modifies the local `all_profiles_map`. # Un-rest profiles. This also reads from and modifies the local `all_profiles_map`.
self.enforce_unrest_policy(getattr(args, 'profile_groups', []), all_profiles_map) self.enforce_unrest_policy(getattr(args, 'profile_groups', []), all_profiles_map, args)
# --- Phase 3: Apply policies to individual active profiles --- # --- Phase 3: Apply policies to individual active profiles ---
# Use the now-updated snapshot to determine which profiles are active. # Use the now-updated snapshot to determine which profiles are active.
@ -148,7 +149,7 @@ class PolicyEnforcer:
return True # Indicates profile was rested return True # Indicates profile was rested
return False return False
def enforce_unrest_policy(self, profile_groups, all_profiles_map): def enforce_unrest_policy(self, profile_groups, all_profiles_map, args):
all_profiles_list = list(all_profiles_map.values()) all_profiles_list = list(all_profiles_map.values())
resting_profiles = [p for p in all_profiles_list if p['state'] == self.manager.STATE_RESTING] resting_profiles = [p for p in all_profiles_list if p['state'] == self.manager.STATE_RESTING]
cooldown_profiles = [p for p in all_profiles_list if p['state'] == self.manager.STATE_COOLDOWN] cooldown_profiles = [p for p in all_profiles_list if p['state'] == self.manager.STATE_COOLDOWN]
@ -158,10 +159,6 @@ class PolicyEnforcer:
if not profiles_to_check: if not profiles_to_check:
return return
# Sort profiles to check by their rest_until timestamp, then by name.
# This creates a deterministic FIFO queue for activation.
profiles_to_check.sort(key=lambda p: (p.get('rest_until', 0), p.get('name', '')))
# --- Group-aware unrest logic --- # --- Group-aware unrest logic ---
profile_to_group_map = {} profile_to_group_map = {}
group_to_profiles_map = {} group_to_profiles_map = {}
@ -194,6 +191,57 @@ class PolicyEnforcer:
live_active_counts[group_name] = count live_active_counts[group_name] = count
# --- End group logic setup --- # --- End group logic setup ---
# --- New Sorting Logic based on Profile Selection Strategy ---
strategy = getattr(args, 'profile_selection_strategy', 'round_robin')
if strategy == 'least_loaded' and profile_groups:
logger.debug("Applying 'least_loaded' profile selection strategy.")
# Separate profiles that are ready from those that are not
ready_profiles = [p for p in profiles_to_check if now >= p.get('rest_until', 0)]
not_ready_profiles = [p for p in profiles_to_check if now < p.get('rest_until', 0)]
# Group ready profiles by their group name
ready_by_group = collections.defaultdict(list)
for p in ready_profiles:
group_name = profile_to_group_map.get(p['name'])
if group_name:
ready_by_group[group_name].append(p)
# Calculate load for each group (sum of pending downloads of all profiles in the group)
group_load = {}
for group_name, profiles_in_group_names in group_to_profiles_map.items():
total_pending = sum(
all_profiles_map.get(p_name, {}).get('pending_downloads', 0)
for p_name in profiles_in_group_names
)
group_load[group_name] = total_pending
# Sort groups by load, then by name for stability
sorted_groups = sorted(group_load.items(), key=lambda item: (item[1], item[0]))
logger.debug(f"Group load order: {[(name, load) for name, load in sorted_groups]}")
# Rebuild the list of ready profiles, ordered by group load
sorted_ready_profiles = []
for group_name, load in sorted_groups:
profiles_in_group = ready_by_group.get(group_name, [])
# Within a group, sort by rest_until (FIFO)
profiles_in_group.sort(key=lambda p: (p.get('rest_until', 0), p.get('name', '')))
sorted_ready_profiles.extend(profiles_in_group)
# Add profiles not in any group to the end
profiles_not_in_group = [p for p in ready_profiles if not profile_to_group_map.get(p['name'])]
profiles_not_in_group.sort(key=lambda p: (p.get('rest_until', 0), p.get('name', '')))
sorted_ready_profiles.extend(profiles_not_in_group)
# The final list to check is the sorted ready profiles, followed by the not-ready ones.
not_ready_profiles.sort(key=lambda p: (p.get('rest_until', 0), p.get('name', '')))
profiles_to_check = sorted_ready_profiles + not_ready_profiles
else: # Default FIFO sort
if strategy not in ['round_robin']:
logger.warning(f"Unknown or unhandled profile_selection_strategy '{strategy}'. Defaulting to 'round_robin' (FIFO).")
profiles_to_check.sort(key=lambda p: (p.get('rest_until', 0), p.get('name', '')))
# --- End New Sorting Logic ---
# --- New logic: Identify groups with waiting profiles --- # --- New logic: Identify groups with waiting profiles ---
groups_with_waiting_profiles = {} groups_with_waiting_profiles = {}
if profile_groups: if profile_groups:
@ -1170,6 +1218,7 @@ def main_policy_enforcer(args):
'unlock_stale_locks_after_seconds': 120, 'unlock_stale_locks_after_seconds': 120,
'unlock_cooldown_seconds': 0, 'unlock_cooldown_seconds': 0,
'max_global_proxy_active_minutes': 0, 'rest_duration_on_max_active': 10, 'max_global_proxy_active_minutes': 0, 'rest_duration_on_max_active': 10,
'profile_selection_strategy': 'round_robin',
'interval_seconds': 60, 'proxy_groups': [], 'profile_groups': [] 'interval_seconds': 60, 'proxy_groups': [], 'profile_groups': []
} }

View File

@ -290,17 +290,26 @@ class ProfileManager:
if not profile_names: if not profile_names:
return [] return []
# Use a pipeline to fetch all profile data at once for efficiency # --- Batch fetch profile data to avoid timeouts ---
pipe = self.redis.pipeline() all_profile_data = []
for name in profile_names: all_pending_downloads = []
pipe.hgetall(self._profile_key(name)) batch_size = 500
all_profile_data = pipe.execute()
for i in range(0, len(profile_names), batch_size):
# Also fetch pending download counts for all profiles batch_names = profile_names[i:i + batch_size]
pipe = self.redis.pipeline()
for name in profile_names: # Fetch profile hashes
pipe.get(self._pending_downloads_key(name)) pipe = self.redis.pipeline()
all_pending_downloads = pipe.execute() for name in batch_names:
pipe.hgetall(self._profile_key(name))
all_profile_data.extend(pipe.execute())
# Fetch pending download counts
pipe = self.redis.pipeline()
for name in batch_names:
pipe.get(self._pending_downloads_key(name))
all_pending_downloads.extend(pipe.execute())
# --- End batch fetch ---
numeric_fields = ['created_at', 'last_used', 'success_count', 'failure_count', numeric_fields = ['created_at', 'last_used', 'success_count', 'failure_count',
'tolerated_error_count', 'download_count', 'download_error_count', 'tolerated_error_count', 'download_count', 'download_error_count',
@ -667,27 +676,31 @@ class ProfileManager:
if not proxy_urls: if not proxy_urls:
return {} return {}
pipe = self.redis.pipeline()
for proxy_url in proxy_urls:
pipe.hgetall(self._proxy_state_key(proxy_url))
results = pipe.execute()
states = {} states = {}
for i, data in enumerate(results): batch_size = 500
proxy_url = proxy_urls[i]
if data: for i in range(0, len(proxy_urls), batch_size):
# Convert numeric fields batch_urls = proxy_urls[i:i + batch_size]
for field in ['rest_until', 'work_start_timestamp']:
if field in data: pipe = self.redis.pipeline()
try: for proxy_url in batch_urls:
data[field] = float(data[field]) pipe.hgetall(self._proxy_state_key(proxy_url))
except (ValueError, TypeError): results = pipe.execute()
data[field] = 0.0
states[proxy_url] = data for j, data in enumerate(results):
else: proxy_url = batch_urls[j]
# Default to ACTIVE if no state is found if data:
states[proxy_url] = {'state': self.STATE_ACTIVE, 'rest_until': 0.0, 'work_start_timestamp': 0.0} # Convert numeric fields
for field in ['rest_until', 'work_start_timestamp']:
if field in data:
try:
data[field] = float(data[field])
except (ValueError, TypeError):
data[field] = 0.0
states[proxy_url] = data
else:
# Default to ACTIVE if no state is found
states[proxy_url] = {'state': self.STATE_ACTIVE, 'rest_until': 0.0, 'work_start_timestamp': 0.0}
return states return states

View File

@ -129,7 +129,8 @@ Overridable Policy Parameters via --set:
download_policy.proxy_rename Regex substitution for the proxy URL (e.g., 's/old/new/'). download_policy.proxy_rename Regex substitution for the proxy URL (e.g., 's/old/new/').
download_policy.pause_before_download_seconds Pause for N seconds before starting each download attempt. download_policy.pause_before_download_seconds Pause for N seconds before starting each download attempt.
download_policy.continue_downloads Enable download continuation (true/false). download_policy.continue_downloads Enable download continuation (true/false).
download_policy.cleanup After success: for native downloaders, rename and truncate file to 0 bytes; for 'aria2c_rpc', remove file(s) from filesystem. download_policy.cleanup After success, replace downloaded media file with a zero-byte '.empty' file.
download_policy.run_ffprobe After success, run ffprobe on the media file and save stream info to a .ffprobe.json file.
download_policy.extra_args A string of extra arguments for the download script (e.g., "--limit-rate 5M"). download_policy.extra_args A string of extra arguments for the download script (e.g., "--limit-rate 5M").
download_policy.sleep_per_proxy_seconds Cooldown in seconds between downloads on the same proxy. download_policy.sleep_per_proxy_seconds Cooldown in seconds between downloads on the same proxy.
download_policy.rate_limits.per_proxy.max_requests Max downloads for a single proxy in a time period. download_policy.rate_limits.per_proxy.max_requests Max downloads for a single proxy in a time period.
@ -195,6 +196,9 @@ Overridable Policy Parameters via --set:
'If a path is provided, cleans that directory. ' 'If a path is provided, cleans that directory. '
'If used without a path, cleans the directory specified in download_policy.output_dir or direct_docker_cli_policy.docker_host_download_path. ' 'If used without a path, cleans the directory specified in download_policy.output_dir or direct_docker_cli_policy.docker_host_download_path. '
'If no output_dir is set, it fails.') 'If no output_dir is set, it fails.')
download_util_group.add_argument('--run-ffprobe', action=argparse.BooleanOptionalAction, default=None,
help='After a successful download, run ffprobe to generate a stream info JSON file. '
'Overrides download_policy.run_ffprobe.')
download_util_group.add_argument('--reset-local-cache-folder', nargs='?', const='.', default=None, download_util_group.add_argument('--reset-local-cache-folder', nargs='?', const='.', default=None,
help="Before running, delete the contents of the local cache folder used by direct_docker_cli mode. " help="Before running, delete the contents of the local cache folder used by direct_docker_cli mode. "
"The cache folder is defined by 'direct_docker_cli_policy.docker_host_cache_path' in the policy. " "The cache folder is defined by 'direct_docker_cli_policy.docker_host_cache_path' in the policy. "

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -56,7 +56,6 @@ def add_task_generator_parser(subparsers):
gen_parser.add_argument('--formats', required=True, help='A comma-separated list of format IDs or selectors to generate tasks for (e.g., "18,140,bestvideo").') gen_parser.add_argument('--formats', required=True, help='A comma-separated list of format IDs or selectors to generate tasks for (e.g., "18,140,bestvideo").')
gen_parser.add_argument('--live', action='store_true', help='Run continuously, watching the source directory for new files.') gen_parser.add_argument('--live', action='store_true', help='Run continuously, watching the source directory for new files.')
gen_parser.add_argument('--interval-seconds', type=int, default=10, help='When in --live mode, how often to scan for new files.') gen_parser.add_argument('--interval-seconds', type=int, default=10, help='When in --live mode, how often to scan for new files.')
gen_parser.add_argument('--dummy', action='store_true', help='Generate dummy task files without reading info.json content. Useful for testing download workers.')
gen_parser.add_argument('--verbose', action='store_true', help='Enable verbose logging.') gen_parser.add_argument('--verbose', action='store_true', help='Enable verbose logging.')
reset_parser = generate_subparsers.add_parser( reset_parser = generate_subparsers.add_parser(
@ -124,29 +123,11 @@ def main_task_generator(args):
return 1 return 1
def _generate_tasks_for_file(source_file, output_dir, formats_to_generate, is_dummy_mode): def _generate_tasks_for_file(source_file, output_dir, formats_to_generate):
"""Helper function to generate task files for a single source info.json.""" """Helper function to generate task files for a single source info.json."""
try: try:
info_json_content = {} with open(source_file, 'r', encoding='utf-8') as f:
if is_dummy_mode: info_json_content = json.load(f)
# In dummy mode, we don't read the file content. We create a minimal structure.
# We try to parse the filename to get video_id and profile_name for organization.
# Example filename: {video_id}-{profile_name}-{proxy}.info.json
parts = source_file.stem.split('-')
video_id = parts[0] if parts else 'dummy_video'
profile_name = next((p for p in parts if p.startswith('user')), None)
info_json_content = {
'id': video_id,
'_dummy': True,
'_ytops_metadata': {
'profile_name': profile_name
}
}
logger.debug(f"DUMMY MODE: Generating tasks for source file: {source_file.name}")
else:
with open(source_file, 'r', encoding='utf-8') as f:
info_json_content = json.load(f)
except (IOError, json.JSONDecodeError) as e: except (IOError, json.JSONDecodeError) as e:
logger.warning(f"Skipping file '{source_file.name}' due to read/parse error: {e}") logger.warning(f"Skipping file '{source_file.name}' due to read/parse error: {e}")
return 0 return 0
@ -237,7 +218,7 @@ def _main_task_generator_generate(args):
total_tasks_generated = 0 total_tasks_generated = 0
for source_file in source_files: for source_file in source_files:
tasks_for_file = _generate_tasks_for_file(source_file, output_dir, formats_to_generate, args.dummy) tasks_for_file = _generate_tasks_for_file(source_file, output_dir, formats_to_generate)
total_tasks_generated += tasks_for_file total_tasks_generated += tasks_for_file
logger.info(f"Successfully generated {total_tasks_generated} new task file(s) in '{output_dir}'.") logger.info(f"Successfully generated {total_tasks_generated} new task file(s) in '{output_dir}'.")
@ -258,7 +239,7 @@ def _main_task_generator_generate(args):
logger.info(f"Live mode: Found {len(source_files)} source file(s) to process.") logger.info(f"Live mode: Found {len(source_files)} source file(s) to process.")
for source_file in source_files: for source_file in source_files:
if shutdown_event: break if shutdown_event: break
tasks_for_file = _generate_tasks_for_file(source_file, output_dir, formats_to_generate, args.dummy) tasks_for_file = _generate_tasks_for_file(source_file, output_dir, formats_to_generate)
total_tasks_generated += tasks_for_file total_tasks_generated += tasks_for_file
if shutdown_event: break if shutdown_event: break