diff --git a/ytops_client-source/policies/10_direct_docker_auth_simulation.yaml b/ytops_client-source/policies/10_direct_docker_auth_simulation.yaml index 588b97a..3cd6848 100644 --- a/ytops_client-source/policies/10_direct_docker_auth_simulation.yaml +++ b/ytops_client-source/policies/10_direct_docker_auth_simulation.yaml @@ -14,19 +14,35 @@ settings: mode: fetch_only orchestration_mode: direct_docker_cli profile_mode: from_pool_with_lock - urls_file: "inputfiles/urls.sky3.txt" + urls_file: "inputfiles/urls.rt300.txt" # The save directory MUST be inside the docker_host_mount_path for the download # simulation to be able to find the files. save_info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation" + # Settings for controlling the behavior of dummy/simulation modes. + # These values can be overridden at runtime with the --set flag. + dummy_simulation_settings: + # Timings for dummy auth simulation (per-URL delay in a batch) + auth_min_seconds: 0.1 + auth_max_seconds: 0.5 + auth_failure_rate: 0.0 + auth_skipped_failure_rate: 0.0 + # Timings for dummy download simulation (per-format download time) + download_min_seconds: 1.0 + download_max_seconds: 3.0 + download_failure_rate: 0.0 + download_skipped_failure_rate: 0.0 + execution_control: - # Define worker pools, each tied to a specific profile prefix. - # The stress tool will launch the specified number of workers for each pool. + # Define worker pools. For a single auth worker that serves multiple groups + # (e.g., user1, user2), a single pool with a broad prefix like "user" is + # correct. This allows the worker to lock whichever profile the enforcer + # makes available from any group. worker_pools: - - profile_prefix: "user1" - workers: 1 - - profile_prefix: "user2" + - profile_prefix: "user" workers: 1 + # - profile_prefix: "user2" + # workers: 1 # How long a worker should pause if it cannot find an available profile to lock. worker_polling_interval_seconds: 1 # No sleep between tasks; throughput is controlled by yt-dlp performance and profile availability. @@ -43,7 +59,7 @@ info_json_generation_policy: # This section is needed for the 'downloads_per_url: from_download_policy' setting. # It should mirror the formats being used by the download simulation. download_policy: - formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy,140-dashy/140-dashy-0/140" + formats: "299-dashy" #/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy,140-dashy/140-dashy-0/140" direct_docker_cli_policy: # Which simulation environment's profiles to use for locking. @@ -88,7 +104,7 @@ direct_docker_cli_policy: # https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/utils/networking.py user_agent_version_range: [137, 143] - batch_size: 25 + batch_size: 5 # A base config file can be used, with overrides applied from the policy. # The orchestrator will inject 'proxy', 'batch-file', and 'output' keys into the overrides. 
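The new `dummy_simulation_settings` block above drives the dummy/simulation code paths: each URL in a batch is delayed by a random per-URL interval and then succeeds or fails according to the configured failure rate. The sketch below is illustrative only — the function name, settings dict, and URL are assumptions rather than the actual ytops_client implementation — but it shows how values such as `auth_min_seconds`, `auth_max_seconds`, and `auth_failure_rate` are typically interpreted.

import random
import time

def simulate_dummy_auth(settings, url):
    # Sleep for a random per-URL delay within the configured bounds.
    delay = random.uniform(settings["auth_min_seconds"], settings["auth_max_seconds"])
    time.sleep(delay)
    # auth_failure_rate is treated as a probability; 0.0 (as configured above) means every URL succeeds.
    return random.random() >= settings["auth_failure_rate"]

# Values mirror the policy defaults above; the URL is a hypothetical placeholder.
dummy_settings = {"auth_min_seconds": 0.1, "auth_max_seconds": 0.5, "auth_failure_rate": 0.0}
print(simulate_dummy_auth(dummy_settings, "https://example.com/watch?v=placeholder"))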
diff --git a/ytops_client-source/policies/11_direct_docker_download_simulation.yaml b/ytops_client-source/policies/11_direct_docker_download_simulation.yaml index 1f060bb..db5665e 100644 --- a/ytops_client-source/policies/11_direct_docker_download_simulation.yaml +++ b/ytops_client-source/policies/11_direct_docker_download_simulation.yaml @@ -8,14 +8,15 @@ name: direct_docker_download_simulation settings: + #dummy_batch: true mode: download_only orchestration_mode: direct_docker_cli profile_mode: from_pool_with_lock # This directory should contain info.json files generated by an auth simulation, # like `10_direct_docker_auth_simulation`. # It MUST be inside the docker_host_mount_path. - info_json_dir: "run/docker_mount/download_tasks" - #info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation" + info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation" + #info_json_dir: "run/docker_mount/download_tasks" # Regex to extract the profile name from a task filename. The first capture # group is used. This is crucial for the task-first locking strategy. # It looks for a component that starts with 'user' between two hyphens. @@ -37,11 +38,11 @@ download_policy: # A comma-separated list of format IDs to download for each info.json. # This is used by the dummy mode simulation to test per-format downloads. # In non-dummy mode, the format selector in ytdlp_config_overrides is used. - formats: "140-dashy,299-dashy" + #formats: "140-dashy,299-dashy" # After a successful download, run ffprobe to generate a stream info JSON file. - run_ffprobe: true + #run_ffprobe: true # After a successful download, replace the media file with a zero-byte .empty file. - cleanup: true + #cleanup: true # Default cooldown in seconds if not specified by the enforcer in Redis. # The value from Redis (set via `unlock_cooldown_seconds` in the enforcer policy) # will always take precedence. This is a fallback. @@ -99,7 +100,8 @@ direct_docker_cli_policy: buffer-size: "4M" concurrent-fragments: 8 - ytdlp_raw_args: [] + ytdlp_raw_args: # [] + - "--simulate" # --- Live Error Parsing Rules --- # If a fatal error is detected, immediately ban the profile to stop the container. 
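The download simulation above depends on extracting a profile name from each task filename ("Regex to extract the profile name from a task filename. The first capture group is used."). A minimal sketch of that idea, assuming a hypothetical filename and a pattern chosen to match the comment's description ("a component that starts with 'user' between two hyphens") — the real pattern is defined in the policy file and may differ:

import re

# Hypothetical task filename; the actual naming scheme comes from the auth simulation output.
task_filename = "20240101T120000-user2_1-dQw4w9WgXcQ.info.json"

match = re.search(r"-(user[^-]+)-", task_filename)
if match:
    # The first capture group is the profile name used for task-first locking.
    print(match.group(1))  # -> user2_1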
diff --git a/ytops_client-source/policies/6_profile_setup_policy.yaml b/ytops_client-source/policies/6_profile_setup_policy.yaml index ee2bef7..03320db 100644 --- a/ytops_client-source/policies/6_profile_setup_policy.yaml +++ b/ytops_client-source/policies/6_profile_setup_policy.yaml @@ -14,10 +14,13 @@ auth_profile_setup: pools: - prefix: "user1" proxy: "sslocal-rust-1092:1092" - count: 4 + count: 3 - prefix: "user2" - proxy: "sslocal-rust-1093:1093" - count: 4 + proxy: "sslocal-rust-1092:1092" + count: 3 + - prefix: "user3" + proxy: "sslocal-rust-1092:1092" + count: 3 # --- Profile setup for the DOWNLOAD simulation --- download_profile_setup: @@ -26,7 +29,10 @@ download_profile_setup: pools: - prefix: "user1" proxy: "sslocal-rust-1092:1092" - count: 4 + count: 3 - prefix: "user2" - proxy: "sslocal-rust-1093:1093" - count: 4 + proxy: "sslocal-rust-1092:1092" + count: 3 + - prefix: "user3" + proxy: "sslocal-rust-1092:1092" + count: 3 diff --git a/ytops_client-source/policies/8_unified_simulation_enforcer.yaml b/ytops_client-source/policies/8_unified_simulation_enforcer.yaml index c06c27a..3c20308 100644 --- a/ytops_client-source/policies/8_unified_simulation_enforcer.yaml +++ b/ytops_client-source/policies/8_unified_simulation_enforcer.yaml @@ -2,10 +2,6 @@ # This file is used by `bin/ytops-client policy-enforcer --live` to manage # both the authentication and download simulation environments from a single process. -# Policy for the unified simulation enforcer. -# This file is used by `bin/ytops-client policy-enforcer --live` to manage -# both the authentication and download simulation environments from a single process. - simulation_parameters: # --- Common Redis settings for all tools --- # The enforcer will connect to two different Redis environments (key prefixes) @@ -19,10 +15,6 @@ simulation_parameters: # --- Policies for the Authentication Simulation --- auth_policy_enforcer_config: - # New setting for load balancing across profile groups. - # "round_robin": Cycle through available groups evenly (FIFO based on rest time). - # "least_loaded": Prioritize the group with the fewest pending downloads. - profile_selection_strategy: "least_loaded" # Ban if 2 failures occur within a 1-minute window. #ban_on_failures: 2 @@ -32,7 +24,7 @@ auth_policy_enforcer_config: # New rate limit policy to enforce requests-per-hour limits. # For guest sessions, the limit is ~300 videos/hour. - rate_limit_requests: 280 + rate_limit_requests: 0 rate_limit_window_minutes: 60 rate_limit_rest_duration_minutes: 5 @@ -44,41 +36,45 @@ auth_policy_enforcer_config: # For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour). # The settings below should be configured to respect these limits. - # A group of profiles that are managed together. - # The enforcer will ensure that no more than `max_active_profiles` from this - # group are in the ACTIVE state at any time. + # New setting for load balancing across profile groups. + # "longest_idle": Activates the profile that has been idle the longest across all groups (based on last_used time). + # This is a global FIFO strategy that effectively cycles through profiles regardless of their group. + # "least_loaded": Prioritizes activating a profile from the group with the fewest pending downloads. + # If multiple groups have zero pending downloads, it acts as a FIFO queue, activating + # the one that finished its last download batch the earliest. This is useful when you want + # to ensure a group finishes its entire workload before another group starts. 
+ profile_selection_strategy: "longest_idle" + + # Enforce a total limit of active profiles across all groups defined below. + # Set to 1 to ensure only one group's profile is active at any time. + global_max_active_profiles: 1 + + # Define separate profile groups for each user type. With the global limit + # above, only one group's profile is active at a time; when that group rotates + # into 'waiting_downloads', another group is activated so the auth worker is never blocked. profile_groups: - name: "auth_user1" prefix: "user1" - # Enforce that only 1 profile from this group can be active at a time. max_active_profiles: 1 - # After an active profile has been used for this many requests, it will be - # rotated out and put into a RESTING state. - rotate_after_requests: 25 - # How long a profile rests after being rotated out. - rest_duration_minutes_on_rotation: 1 - - # If true, no new profile in this group will be activated while another - # one is in the 'waiting_downloads' state. - defer_activation_if_any_waiting: true - - # --- New settings for download wait feature --- - # When a profile is rotated, wait for its generated downloads to finish - # before it can be used again. - wait_download_finish_per_profile: true - # Safety net: max time to wait for downloads before forcing rotation. - # Should be aligned with info.json URL validity (e.g., 4 hours = 240 mins). + rotate_after_requests: 5 + rest_duration_minutes_on_rotation: 0.20 + wait_download_finish_per_group: true max_wait_for_downloads_minutes: 240 - name: "auth_user2" prefix: "user2" max_active_profiles: 1 - rotate_after_requests: 25 - rest_duration_minutes_on_rotation: 1 - defer_activation_if_any_waiting: true - wait_download_finish_per_profile: true + rotate_after_requests: 5 + rest_duration_minutes_on_rotation: 0.20 + wait_download_finish_per_group: true + max_wait_for_downloads_minutes: 240 + - name: "auth_user3" + prefix: "user3" + max_active_profiles: 1 + rotate_after_requests: 5 + rest_duration_minutes_on_rotation: 0.20 + wait_download_finish_per_group: true max_wait_for_downloads_minutes: 240 - # Time-based proxy rules are disabled as they are not needed for this setup. proxy_work_minutes: 0 proxy_rest_duration_minutes: 0 @@ -98,31 +94,41 @@ auth_policy_enforcer_config: # A short post-task cooldown for auth simulation profiles. When a batch is finished, # the profile is put into COOLDOWN briefly. This prevents a worker from immediately # re-locking the same profile, giving the policy enforcer a window to perform rotation. - unlock_cooldown_seconds: 1 + unlock_cooldown_seconds: 0 -# Cross-simulation synchronization -#cross_simulation_sync: - # Link auth profiles to download profiles (by name) - # Both profiles should exist in their respective environments - #profile_links: - # - auth: "user1" - # download: "user1" - # - auth: "user2" - # download: "user2" - # Which states to synchronize - #sync_states: - # - "RESTING" # Disabling to prevent deadlock when auth profile is waiting for downloads. - # The download profile must remain active to process them. - # - "BANNED" - # Whether to sync rotation (when auth is rotated due to rotate_after_requests) - #sync_rotation: true - # Whether download profile should be banned if auth is banned (even if download hasn't violated its own rules) - #enforce_auth_lead: true - # Ensures the same profile (e.g., user1_0) is active in both simulations. - # This will activate the correct download profile and rest any others in its group.
- #sync_active_profile: true - # When an auth profile is waiting for downloads, ensure the matching download profile is active - #sync_waiting_downloads: true +# --- Cross-simulation synchronization --- +cross_simulation_sync: + # Link auth profiles to download profiles (by prefix) + profile_links: + - auth: "user1" + download: "user1" + - auth: "user2" + download: "user2" + - auth: "user3" + download: "user3" + + # Which states to synchronize from auth to download. + # 'RESTING' is no longer needed here; the new group-aware deactivation logic + # in `sync_active_profile` handles rotation more cleanly. + sync_states: + - "BANNED" + + # If true, when an auth profile is rotated, the corresponding + # download profile group will also be rotated. This is now handled by the + # group-aware deactivation logic triggered by `sync_active_profile`. + sync_rotation: true + + # If true, a BANNED state on an auth profile will force the download profile + # to also be BANNED. + enforce_auth_lead: true + + # CRITICAL: Ensures the correct download profile GROUP is active. + # This will activate the target download profile and rest any profiles in other groups. + sync_active_profile: true + + # When an auth profile is in the 'waiting_downloads' state, ensure the + # matching download profile is active so it can process those downloads. + sync_waiting_downloads: true # --- Policies for the Download Simulation --- download_policy_enforcer_config: @@ -137,7 +143,6 @@ download_policy_enforcer_config: rate_limit_requests: 280 rate_limit_window_minutes: 60 rate_limit_rest_duration_minutes: 5 - # rest_after_requests: 0 rest_duration_minutes: 20 @@ -146,21 +151,27 @@ download_policy_enforcer_config: # For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour). # The settings below should be configured to respect these limits. - # A group of profiles that are mutually exclusive. Only one will be active at a time. + # Define separate profile groups for download workers. + # Increase max_active_profiles to allow all profiles in a group to be used. profile_groups: - name: "download_user1" prefix: "user1" - rotate_after_requests: 25 - rest_duration_minutes_on_rotation: 1 - max_active_profiles: 4 + rotate_after_requests: 0 + rest_duration_minutes_on_rotation: 0.2 + # max_active_profiles: 0 # Allow all profiles in this group to be active (0, -1, or omitted) - name: "download_user2" prefix: "user2" - rotate_after_requests: 25 - rest_duration_minutes_on_rotation: 1 - max_active_profiles: 4 + rotate_after_requests: 0 + rest_duration_minutes_on_rotation: 0.2 + # max_active_profiles: 0 # Allow all profiles in this group to be active (0, -1, or omitted) + - name: "download_user3" + prefix: "user3" + rotate_after_requests: 0 + rest_duration_minutes_on_rotation: 0.2 + # max_active_profiles: 0 # Allow all profiles in this group to be active (0, -1, or omitted) # Time-based proxy rules are disabled. - proxy_work_minutes: 50 + proxy_work_minutes: 0 proxy_rest_duration_minutes: 10 # Global maximum time a proxy can be active before being rested, regardless of @@ -177,5 +188,5 @@ download_policy_enforcer_config: unlock_stale_locks_after_seconds: 960 # After a profile is used for a download, unlock it but put it in COOLDOWN - # state for 12-16s. This is enforced by the worker, which reads this config from Redis. + # state for 2-3s. This is enforced by the worker, which reads this config from Redis. 
unlock_cooldown_seconds: [2, 3] diff --git a/ytops_client-source/ytdlp.json b/ytops_client-source/ytdlp.json index 414e5a6..0967ef4 100644 --- a/ytops_client-source/ytdlp.json +++ b/ytops_client-source/ytdlp.json @@ -1,59 +1 @@ -{ - - "ytops": { - "force_renew": [], - "session_params": { - "visitor_rotation_threshold": 0 - } - }, - - "ytdlp_params": { - "debug_printtraffic": true, - "write_pages": false, - "verbose": true, - "no_color": true, - "ignoreerrors": true, - "noresizebuffer": true, - "buffersize": "4M", - "concurrent_fragments": 8, - "socket_timeout": 60, - "outtmpl": { - "default": "%(id)s.f%(format_id)s.%(ext)s" - }, - "restrictfilenames": true, - "updatetime": false, - "noplaylist": true, - "match_filter": "!is_live", - "writeinfojson": true, - "skip_download": true, - "allow_playlist_files": false, - "clean_infojson": true, - "getcomments": false, - "writesubtitles": false, - "writethumbnail": false, - "sleep_interval_requests": 0.75, - "parse_metadata": [ - ":(?P)" - ], - "extractor_args": { - "youtube": { - "player_client": ["tv_simply"], - "formats": ["duplicate"], - "jsc_trace": ["true"], - "pot_trace": ["true"], - "skip": ["translated_subs", "hls"] - }, - "youtubepot-bgutilhttp": { - "base_url": ["http://172.17.0.1:4416"] - } - }, - "noprogress": true, - "format_sort": [ - "res", - "ext:mp4:m4a" - ], - "remuxvideo": "mp4", - "nooverwrites": true, - "continuedl": true - } -} +{} diff --git a/ytops_client-source/ytops_client/go_ytdlp_cli/go-ytdlp b/ytops_client-source/ytops_client/go_ytdlp_cli/go-ytdlp deleted file mode 100755 index c1b1267..0000000 Binary files a/ytops_client-source/ytops_client/go_ytdlp_cli/go-ytdlp and /dev/null differ diff --git a/ytops_client-source/ytops_client/go_ytdlp_cli/go.mod b/ytops_client-source/ytops_client/go_ytdlp_cli/go.mod deleted file mode 100644 index 656e2ff..0000000 --- a/ytops_client-source/ytops_client/go_ytdlp_cli/go.mod +++ /dev/null @@ -1,22 +0,0 @@ -module github.com/yourproject/ytops_client/go_ytdlp_cli - -go 1.23.0 - -toolchain go1.24.4 - -require ( - github.com/lrstanley/go-ytdlp v0.0.0-00010101000000-000000000000 - github.com/spf13/cobra v1.8.0 -) - -replace github.com/lrstanley/go-ytdlp => ../../go-ytdlp - -require ( - github.com/ProtonMail/go-crypto v1.3.0 // indirect - github.com/cloudflare/circl v1.6.1 // indirect - github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/spf13/pflag v1.0.5 // indirect - github.com/ulikunitz/xz v0.5.13 // indirect - golang.org/x/crypto v0.41.0 // indirect - golang.org/x/sys v0.35.0 // indirect -) diff --git a/ytops_client-source/ytops_client/go_ytdlp_cli/go.sum b/ytops_client-source/ytops_client/go_ytdlp_cli/go.sum deleted file mode 100644 index 9a2693b..0000000 --- a/ytops_client-source/ytops_client/go_ytdlp_cli/go.sum +++ /dev/null @@ -1,27 +0,0 @@ -github.com/ProtonMail/go-crypto v1.3.0 h1:ILq8+Sf5If5DCpHQp4PbZdS1J7HDFRXz/+xKBiRGFrw= -github.com/ProtonMail/go-crypto v1.3.0/go.mod h1:9whxjD8Rbs29b4XWbB8irEcE8KHMqaR2e7GWU1R+/PE= -github.com/cloudflare/circl v1.6.1 h1:zqIqSPIndyBh1bjLVVDHMPpVKqp8Su/V+6MeDzzQBQ0= -github.com/cloudflare/circl v1.6.1/go.mod h1:uddAzsPgqdMAYatqJ0lsjX1oECcQLIlRpzZh3pJrofs= -github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= 
-github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/spf13/cobra v1.8.0 h1:7aJaZx1B85qltLMc546zn58BxxfZdR/W22ej9CFoEf0= -github.com/spf13/cobra v1.8.0/go.mod h1:WXLWApfZ71AjXPya3WOlMsY9yMs7YeiHhFVlvLyhcho= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= -github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= -github.com/ulikunitz/xz v0.5.13 h1:ar98gWrjf4H1ev05fYP/o29PDZw9DrI3niHtnEqyuXA= -github.com/ulikunitz/xz v0.5.13/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= -golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= -golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= -golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= -golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/ytops_client-source/ytops_client/go_ytdlp_cli/main.go b/ytops_client-source/ytops_client/go_ytdlp_cli/main.go deleted file mode 100644 index 09a434d..0000000 --- a/ytops_client-source/ytops_client/go_ytdlp_cli/main.go +++ /dev/null @@ -1,64 +0,0 @@ -package main - -import ( - "encoding/json" - "fmt" - "io" - "os" - - "github.com/lrstanley/go-ytdlp" - "github.com/spf13/cobra" -) - -func main() { - cli := &cobra.Command{ - Use: "go-ytdlp", - Short: "A simple CLI wrapper for go-ytdlp.", - SilenceUsage: true, - SilenceErrors: true, - } - - cli.AddCommand(&cobra.Command{ - Use: "flags-to-json [flags...]", - Short: "Converts yt-dlp flags to a JSON config.", - Long: "Converts yt-dlp flags to a JSON config. Note that this does not validate the flags.", - Args: cobra.MinimumNArgs(1), - RunE: func(cmd *cobra.Command, args []string) (err error) { - // The go-ytdlp library documentation mentions FlagsToJSON and SetFlagConfig, - // but these methods are missing from the generated code in the current version. - // Therefore, we cannot implement this command yet. - return fmt.Errorf("flags-to-json is not supported by the underlying go-ytdlp library") - }, - }) - - cli.AddCommand(&cobra.Command{ - Use: "json-to-flags", - Short: "Converts a JSON config to yt-dlp flags.", - Long: "Converts a JSON config to yt-dlp flags. Note that this does not validate the flags. 
Reads from stdin.", - Args: cobra.NoArgs, - RunE: func(cmd *cobra.Command, args []string) (err error) { - var in []byte - in, err = io.ReadAll(cmd.InOrStdin()) - if err != nil { - return err - } - - // Manually unmarshal into FlagConfig since JSONToFlags helper is missing - var cfg ytdlp.FlagConfig - if err := json.Unmarshal(in, &cfg); err != nil { - return fmt.Errorf("failed to unmarshal JSON: %w", err) - } - - flags := cfg.ToFlags() - for _, flag := range flags { - fmt.Fprintln(cmd.OutOrStdout(), flag) - } - return nil - }, - }) - - if err := cli.Execute(); err != nil { - fmt.Fprintln(os.Stderr, "Error:", err) - os.Exit(1) - } -} diff --git a/ytops_client-source/ytops_client/locking_download_emulator_tool.py b/ytops_client-source/ytops_client/locking_download_emulator_tool.py index 907defe..e31a21c 100644 --- a/ytops_client-source/ytops_client/locking_download_emulator_tool.py +++ b/ytops_client-source/ytops_client/locking_download_emulator_tool.py @@ -12,6 +12,7 @@ import logging import os import sys import time +import threading from copy import deepcopy try: @@ -29,7 +30,7 @@ from ytops_client.profile_manager_tool import ProfileManager from ytops_client.stress_policy import utils as sp_utils from ytops_client.stress_policy.state_manager import StateManager from ytops_client.stress_policy.utils import load_policy, apply_overrides -from ytops_client.stress_policy.workers import _run_download_logic +from ytops_client.stress_policy.worker_utils import _run_download_logic from ytops_client.stress_policy_tool import shutdown_event # Configure logging @@ -164,6 +165,8 @@ def main_locking_download_emulator(args): policy=local_policy, state_manager=dummy_state_manager, args=args, # Pass orchestrator args through + running_processes=set(), # This standalone tool doesn't need to track processes + process_lock=threading.Lock(), profile_name=locked_profile['name'], profile_manager_instance=manager ) diff --git a/ytops_client-source/ytops_client/policy_enforcer_tool.py b/ytops_client-source/ytops_client/policy_enforcer_tool.py index 5caaf35..e7f8f89 100644 --- a/ytops_client-source/ytops_client/policy_enforcer_tool.py +++ b/ytops_client-source/ytops_client/policy_enforcer_tool.py @@ -22,7 +22,8 @@ try: except ImportError: load_dotenv = None -from .profile_manager_tool import ProfileManager +from .profile_manager_tool import ProfileManager, natural_sort_key +from .profile_statemachine import ProfileState # Configure logging logging.basicConfig( @@ -65,14 +66,14 @@ class PolicyEnforcer: all_profiles_map = {p['name']: p for p in all_profiles_list} # Apply profile group policies (rotation, max_active). This will modify the local `all_profiles_map`. - self.enforce_profile_group_policies(getattr(args, 'profile_groups', []), all_profiles_map) + self.enforce_profile_group_policies(getattr(args, 'profile_groups', []), all_profiles_map, args) # Un-rest profiles. This also reads from and modifies the local `all_profiles_map`. self.enforce_unrest_policy(getattr(args, 'profile_groups', []), all_profiles_map, args) # --- Phase 3: Apply policies to individual active profiles --- # Use the now-updated snapshot to determine which profiles are active. - active_profiles = [p for p in all_profiles_map.values() if p['state'] == self.manager.STATE_ACTIVE] + active_profiles = [p for p in all_profiles_map.values() if p['state'] == ProfileState.ACTIVE.value] # Filter out profiles that are managed by a profile group, as their state is handled separately. 
profile_groups = getattr(args, 'profile_groups', []) @@ -121,7 +122,9 @@ class PolicyEnforcer: reason = f"Error burst detected: {error_count} errors in the last {window_minutes} minute(s) (threshold: {max_failures})" logger.warning(f"Banning profile '{profile['name']}' due to error burst: {reason}") if not self.dry_run: - self.manager.update_profile_state(profile['name'], self.manager.STATE_BANNED, reason) + sm = self.manager.get_state_machine(profile['name']) + if sm: + sm.ban(reason=reason) self.actions_taken_this_cycle += 1 return True # Indicates profile was banned return False @@ -142,17 +145,17 @@ class PolicyEnforcer: reason = f"Rate limit hit: {activity_count} requests in last {window_minutes} minute(s) (limit: {max_requests})" logger.info(f"Resting profile '{profile['name']}' for {rest_duration_minutes}m: {reason}") if not self.dry_run: - self.manager.update_profile_state(profile['name'], self.manager.STATE_RESTING, reason) - rest_until = time.time() + rest_duration_minutes * 60 - self.manager.update_profile_field(profile['name'], 'rest_until', str(rest_until)) + sm = self.manager.get_state_machine(profile['name']) + if sm: + sm.rest(reason=reason, duration_minutes=rest_duration_minutes) self.actions_taken_this_cycle += 1 return True # Indicates profile was rested return False def enforce_unrest_policy(self, profile_groups, all_profiles_map, args): all_profiles_list = list(all_profiles_map.values()) - resting_profiles = [p for p in all_profiles_list if p['state'] == self.manager.STATE_RESTING] - cooldown_profiles = [p for p in all_profiles_list if p['state'] == self.manager.STATE_COOLDOWN] + resting_profiles = [p for p in all_profiles_list if p['state'] == ProfileState.RESTING.value] + cooldown_profiles = [p for p in all_profiles_list if p['state'] == ProfileState.COOLDOWN.value] profiles_to_check = resting_profiles + cooldown_profiles now = time.time() @@ -169,10 +172,10 @@ class PolicyEnforcer: profiles_in_group = [] if 'profiles' in group: - profiles_in_group = sorted(group['profiles']) + profiles_in_group = sorted(group['profiles'], key=natural_sort_key) elif 'prefix' in group: prefix = group['prefix'] - profiles_in_group = sorted([p['name'] for p in all_profiles_list if p['name'].startswith(prefix)]) + profiles_in_group = sorted([p['name'] for p in all_profiles_list if p['name'].startswith(prefix)], key=natural_sort_key) group_to_profiles_map[group_name] = profiles_in_group for p_name in profiles_in_group: @@ -186,13 +189,81 @@ class PolicyEnforcer: count = 0 for p_name in profiles_in_group: profile_state = all_profiles_map.get(p_name, {}).get('state') - if profile_state in [self.manager.STATE_ACTIVE, self.manager.STATE_LOCKED, self.manager.STATE_COOLDOWN]: + if profile_state in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value, ProfileState.COOLDOWN.value]: count += 1 live_active_counts[group_name] = count + logger.debug(f"Initial live active counts: {live_active_counts}") + + # --- New Global Max Active Logic --- + global_max_active = getattr(args, 'global_max_active_profiles', 0) + live_global_active_count = sum(live_active_counts.values()) + if global_max_active > 0: + logger.debug(f"Enforcing global max active profiles limit of {global_max_active}. 
Current global active: {live_global_active_count}") + # --- End New Global Logic --- + # --- End group logic setup --- + # --- New logic: Determine if the system should wait for downloads --- + is_system_blocked_by_downloads = False + waiting_group_names = set() + if profile_groups: + # Identify all groups that are configured to wait for downloads. + waiting_groups_config = [ + g for g in profile_groups + if g.get('wait_download_finish_per_group') or g.get('wait_download_finish_per_profile') + ] + + if waiting_groups_config: + is_any_group_idle = False + groups_currently_waiting = [] + + for group_config in waiting_groups_config: + group_name = group_config['name'] + profiles_in_group_names = group_to_profiles_map.get(group_name, []) + if not profiles_in_group_names: + continue # Skip empty groups + + has_active_or_locked = any( + all_profiles_map.get(p_name, {}).get('state') in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value] + for p_name in profiles_in_group_names + ) + + total_pending = sum( + all_profiles_map.get(p_name, {}).get('pending_downloads', 0) + for p_name in profiles_in_group_names + ) + + # An "idle" group is one that is not working and has no pending downloads. + # It is ready to start a new work cycle. + if not has_active_or_locked and total_pending == 0: + is_any_group_idle = True + + # A "waiting" group is one that is not working but has pending downloads. + if not has_active_or_locked and total_pending > 0: + groups_currently_waiting.append(group_name) + + waiting_group_names = set(groups_currently_waiting) + + # The system is blocked if no configured group is idle, AND at least one group is actually waiting for downloads. + # This prevents a deadlock if all groups are just 'Working' but none are in the 'waiting_downloads' state yet. + if not is_any_group_idle and groups_currently_waiting: + is_system_blocked_by_downloads = True + logger.info(f"System is waiting for downloads to finish in groups: {groups_currently_waiting}. No new profiles will be activated until a group is free.") + + if is_system_blocked_by_downloads: + # When blocked, we only want to consider profiles that are in the 'waiting_downloads' state, + # as they are the only ones that can unblock the system by finishing their downloads. + # All other resting profiles must wait. 
+ original_count = len(profiles_to_check) + profiles_to_check = [ + p for p in profiles_to_check if p.get('rest_reason') == 'waiting_downloads' + ] + if len(profiles_to_check) != original_count: + logger.debug("Activation is paused for profiles not waiting for downloads.") + # --- End new logic --- + # --- New Sorting Logic based on Profile Selection Strategy --- - strategy = getattr(args, 'profile_selection_strategy', 'round_robin') + strategy = getattr(args, 'profile_selection_strategy', 'longest_idle') if strategy == 'least_loaded' and profile_groups: logger.debug("Applying 'least_loaded' profile selection strategy.") # Separate profiles that are ready from those that are not @@ -206,6 +277,10 @@ class PolicyEnforcer: if group_name: ready_by_group[group_name].append(p) + # Get group states to access timestamps for tie-breaking + all_group_names = list(group_to_profiles_map.keys()) + all_group_states = self.manager.get_profile_group_states(all_group_names) + # Calculate load for each group (sum of pending downloads of all profiles in the group) group_load = {} for group_name, profiles_in_group_names in group_to_profiles_map.items(): @@ -215,31 +290,68 @@ class PolicyEnforcer: ) group_load[group_name] = total_pending - # Sort groups by load, then by name for stability - sorted_groups = sorted(group_load.items(), key=lambda item: (item[1], item[0])) + # --- New Tie-Breaking Logic --- + # Sort groups by load, then by finish time (FIFO for idle groups), then by name for stability + sorted_groups = sorted( + group_load.items(), + key=lambda item: ( + item[1], # Primary sort: pending downloads (load) + # Secondary: prefer group that finished its downloads earliest. + # Use a large number for groups that have never finished, so they go last in a tie. + all_group_states.get(item[0], {}).get('last_finished_downloads_ts', float('inf')), + item[0] # Tertiary: alphabetical name for stability + ) + ) + # --- End New Tie-Breaking Logic --- logger.debug(f"Group load order: {[(name, load) for name, load in sorted_groups]}") # Rebuild the list of ready profiles, ordered by group load sorted_ready_profiles = [] for group_name, load in sorted_groups: profiles_in_group = ready_by_group.get(group_name, []) - # Within a group, sort by rest_until (FIFO) - profiles_in_group.sort(key=lambda p: (p.get('rest_until', 0), p.get('name', ''))) - sorted_ready_profiles.extend(profiles_in_group) + # Within a group, prioritize unused profiles by name, then used profiles by oldest usage. + # An "unused" profile is one with zero session requests. 
+ unused_profiles = [p for p in profiles_in_group if (p.get('success_count', 0) + p.get('failure_count', 0) + p.get('tolerated_error_count', 0) + p.get('download_count', 0) + p.get('download_error_count', 0)) == 0] + used_profiles = [p for p in profiles_in_group if p not in unused_profiles] + + unused_profiles.sort(key=lambda p: natural_sort_key(p.get('name', ''))) + used_profiles.sort(key=lambda p: (p.get('last_used', 0), natural_sort_key(p.get('name', '')))) + + sorted_ready_profiles.extend(unused_profiles + used_profiles) # Add profiles not in any group to the end profiles_not_in_group = [p for p in ready_profiles if not profile_to_group_map.get(p['name'])] - profiles_not_in_group.sort(key=lambda p: (p.get('rest_until', 0), p.get('name', ''))) + # Sort ready profiles not in a group by oldest last_used time first + profiles_not_in_group.sort(key=lambda p: (p.get('last_used', 0), natural_sort_key(p.get('name', '')))) sorted_ready_profiles.extend(profiles_not_in_group) # The final list to check is the sorted ready profiles, followed by the not-ready ones. - not_ready_profiles.sort(key=lambda p: (p.get('rest_until', 0), p.get('name', ''))) + not_ready_profiles.sort(key=lambda p: (p.get('rest_until', 0), natural_sort_key(p.get('name', '')))) profiles_to_check = sorted_ready_profiles + not_ready_profiles - else: # Default FIFO sort - if strategy not in ['round_robin']: - logger.warning(f"Unknown or unhandled profile_selection_strategy '{strategy}'. Defaulting to 'round_robin' (FIFO).") - profiles_to_check.sort(key=lambda p: (p.get('rest_until', 0), p.get('name', ''))) + else: # Default 'longest_idle' sort + if strategy not in ['longest_idle']: + logger.warning(f"Unknown or unhandled profile_selection_strategy '{strategy}'. Defaulting to 'longest_idle'.") + + # Separate profiles that are ready to be activated from those still resting. + # A profile waiting for downloads is NOT considered ready for activation. + ready_profiles = [ + p for p in profiles_to_check + if now >= p.get('rest_until', 0) and p.get('rest_reason') != 'waiting_downloads' + ] + # Profiles still in a timed rest, or waiting for downloads, are not ready. + not_ready_profiles = [p for p in profiles_to_check if p not in ready_profiles] + + # Sort all ready profiles by last_used time to find the longest idle. + # A truly new profile will have a very old 'last_used' time and be selected first. + # A recently used profile will have a new 'last_used' time and be selected last. + ready_profiles.sort(key=lambda p: (p.get('last_used', 0), natural_sort_key(p.get('name', '')))) + + # Sort not-ready profiles by when they will become available (standard FIFO). + not_ready_profiles.sort(key=lambda p: (p.get('rest_until', 0), natural_sort_key(p.get('name', '')))) + + # The final list to check will process all ready profiles first, then wait for the not-ready ones. 
+ profiles_to_check = ready_profiles + not_ready_profiles # --- End New Sorting Logic --- # --- New logic: Identify groups with waiting profiles --- @@ -269,6 +381,12 @@ class PolicyEnforcer: profile_name = profile['name'] group_name = profile_to_group_map.get(profile_name) + # --- New check to prevent activating profiles from a waiting group --- + if group_name in waiting_group_names and profile.get('rest_reason') != 'waiting_downloads': + logger.debug(f"Profile '{profile_name}' activation deferred because its group '{group_name}' is waiting for downloads to complete.") + continue + # --- End new logic --- + # --- New logic: Defer activation if group has a waiting profile --- if group_name in groups_with_waiting_profiles: waiting_profile_name = groups_with_waiting_profiles[group_name] @@ -300,16 +418,31 @@ class PolicyEnforcer: # Transition to a normal post-rotation rest period. new_reason = "Rotation complete (downloads finished)" rest_duration_minutes = group_policy.get('rest_duration_minutes_on_rotation', 0) - rest_until_ts = time.time() + (rest_duration_minutes * 60) if not self.dry_run: - self.manager.update_profile_field(profile['name'], 'rest_reason', new_reason) - self.manager.update_profile_field(profile['name'], 'rest_until', str(rest_until_ts)) - self.manager.clear_pending_downloads(profile_name) - # Reset counters to ensure a fresh start after the wait period. + # The profile is already in RESTING state (for 'waiting_downloads'). + # We cannot call sm.rest() again as it would be a transition from RESTING to RESTING. + # Instead, we manually update the profile's fields in Redis to reflect the + # completion of the wait period and the start of the normal rotation rest. + + # 1. Reset session counters since this is a rotation. self.manager.reset_profile_counters(profile_name) + + # 2. Update rest reason and duration. + rest_until_ts = time.time() + (rest_duration_minutes * 60) if rest_duration_minutes else 0 + self.manager.update_profile_field(profile_name, 'rest_reason', new_reason) + self.manager.update_profile_field(profile_name, 'rest_until', str(rest_until_ts)) + self.manager.update_profile_field(profile_name, 'wait_started_at', '0') + + # 3. Clear the pending downloads counter. + self.manager.clear_pending_downloads(profile_name) + + # Record when this group finished its downloads for FIFO tie-breaking + if group_name: + self.manager.set_profile_group_state(group_name, {'last_finished_downloads_ts': time.time()}) # Update local map so it can be activated in the same cycle if rest_duration is 0 + rest_until_ts = time.time() + (rest_duration_minutes * 60) if rest_duration_minutes else 0 all_profiles_map[profile_name]['rest_reason'] = new_reason all_profiles_map[profile_name]['rest_until'] = rest_until_ts all_profiles_map[profile_name]['success_count'] = 0 @@ -317,6 +450,7 @@ class PolicyEnforcer: all_profiles_map[profile_name]['tolerated_error_count'] = 0 all_profiles_map[profile_name]['download_count'] = 0 all_profiles_map[profile_name]['download_error_count'] = 0 + all_profiles_map[profile_name]['wait_started_at'] = 0 # Let the rest of the unrest logic handle the activation now that rest_until is set. profile['rest_until'] = rest_until_ts # Update profile in loop @@ -330,13 +464,28 @@ class PolicyEnforcer: profile_name = profile['name'] group_name = profile_to_group_map.get(profile_name) + # --- New Global Max Active Check --- + # This check prevents NEW profiles (in RESTING state) from becoming active if the global limit is reached. 
+ # It allows COOLDOWN profiles to become active, as they are already part of the active count. + if global_max_active > 0 and live_global_active_count >= global_max_active and profile['state'] == ProfileState.RESTING.value: + logger.debug(f"Profile '{profile_name}' rest ended, but global max active limit ({global_max_active}) has been reached. Deferring activation.") + continue + # --- End New Global Check --- + # --- Group-aware unrest check --- if group_name: group_policy = next((g for g in profile_groups if g.get('name') == group_name), None) if not group_policy: continue # Should not happen if maps are built correctly - max_active = group_policy.get('max_active_profiles', 1) + max_active_config = group_policy.get('max_active_profiles') + if max_active_config in (0, -1) or max_active_config is None: + max_active = len(group_to_profiles_map.get(group_name, [])) + if max_active == 0: max_active = 1 # Fallback for empty groups + elif isinstance(max_active_config, int) and max_active_config > 0: + max_active = max_active_config + else: + max_active = 1 # Default to 1 on invalid config or for safety # Check if the group is already at its capacity for active profiles. # We use the live counter which is updated during this enforcer cycle. @@ -344,7 +493,7 @@ class PolicyEnforcer: # Special handling for COOLDOWN profiles: they should be allowed to become ACTIVE # even if the group is at capacity, because they are already counted as "active". # We check if the group would be over capacity *without* this profile. - is_cooldown_profile = profile['state'] == self.manager.STATE_COOLDOWN + is_cooldown_profile = profile['state'] == ProfileState.COOLDOWN.value effective_active_count = live_active_counts.get(group_name, 0) # If we are considering a COOLDOWN profile, it's already in the count. @@ -354,6 +503,14 @@ class PolicyEnforcer: if is_cooldown_profile: capacity_check_count -= 1 + logger.debug( + f"Checking capacity for '{profile_name}' in group '{group_name}': " + f"is_cooldown={is_cooldown_profile}, " + f"live_count={effective_active_count}, " + f"check_count={capacity_check_count}, " + f"max_active={max_active}" + ) + if capacity_check_count >= max_active: logger.debug(f"Profile '{profile_name}' rest ended, but group '{group_name}' is at capacity ({effective_active_count}/{max_active}). Deferring activation.") @@ -364,12 +521,13 @@ class PolicyEnforcer: logger.info(f"Profile '{profile_name}' cooldown ended but group is full. Moving to RESTING to wait for a slot.") self.actions_taken_this_cycle += 1 if not self.dry_run: - # We need to update state but keep rest_until in the past. - self.manager.update_profile_state(profile['name'], self.manager.STATE_RESTING, reason) - self.manager.update_profile_field(profile['name'], 'rest_until', '0') + sm = self.manager.get_state_machine(profile['name']) + if sm: + # duration_minutes=0 will set rest_until to 0, making it immediately eligible. 
+ sm.rest(reason=reason, duration_minutes=0) # Update local map - all_profiles_map[profile_name]['state'] = self.manager.STATE_RESTING + all_profiles_map[profile_name]['state'] = ProfileState.RESTING.value all_profiles_map[profile_name]['rest_until'] = 0 all_profiles_map[profile_name]['rest_reason'] = reason @@ -397,7 +555,7 @@ class PolicyEnforcer: proxy_url = profile.get('proxy') if proxy_url: proxy_state_data = proxy_states.get(proxy_url, {}) - if proxy_state_data.get('state') == self.manager.STATE_RESTING: + if proxy_state_data.get('state') == ProfileState.RESTING.value: logger.debug(f"Profile '{profile['name']}' rest period ended, but its proxy '{proxy_url}' is still resting. Deferring activation.") # Update reason for clarity in the UI when a profile is blocked by its proxy. @@ -413,9 +571,12 @@ class PolicyEnforcer: continue # Do not activate this profile yet. # Update group counter BEFORE making any changes, so subsequent checks in this cycle use the updated count - if group_name and profile['state'] == self.manager.STATE_RESTING: + if group_name and profile['state'] == ProfileState.RESTING.value: # For RESTING profiles, they're becoming active, so increment the count live_active_counts[group_name] = live_active_counts.get(group_name, 0) + 1 + # Also increment the global counter + if global_max_active > 0: + live_global_active_count += 1 # COOLDOWN profiles are already counted, no change needed logger.info(f"Activating profile '{profile['name']}' (rest period completed).") @@ -425,15 +586,28 @@ class PolicyEnforcer: is_waiting_after_cooldown = profile.get('rest_reason') == "Waiting for group capacity" if not self.dry_run: - # When un-resting from a long rest, reset counters to give it a fresh start. - # Do not reset for COOLDOWN or a profile that was waiting after cooldown. - self.manager.update_profile_state(profile['name'], self.manager.STATE_ACTIVE, "Rest period completed") - if profile['state'] == self.manager.STATE_RESTING and not is_waiting_after_cooldown: - self.manager.reset_profile_counters(profile['name']) + sm = self.manager.get_state_machine(profile['name']) + if sm: + # Pass the profile object so the action can inspect its old state + # without an extra Redis call. + sm.activate(profile=profile) + + # Log the activation event for observability + reason = f"{profile['state']} completed" + if is_waiting_after_cooldown: + reason = "Activated after waiting" + + log_entry = { + 'ts': now, + 'profile': profile['name'], + 'group': group_name, + 'reason': reason + } + self.manager.log_activation_event(log_entry) # Update the shared map to reflect the change immediately for this cycle. 
- all_profiles_map[profile_name]['state'] = self.manager.STATE_ACTIVE - if profile['state'] == self.manager.STATE_RESTING and not is_waiting_after_cooldown: + all_profiles_map[profile_name]['state'] = ProfileState.ACTIVE.value + if profile['state'] == ProfileState.RESTING.value and not is_waiting_after_cooldown: all_profiles_map[profile_name]['success_count'] = 0 all_profiles_map[profile_name]['failure_count'] = 0 all_profiles_map[profile_name]['tolerated_error_count'] = 0 @@ -457,7 +631,9 @@ class PolicyEnforcer: reason = f"Global failure rate {current_failure_rate:.2f} >= threshold {max_failure_rate} ({int(failure)}/{int(total)} failures)" logger.warning(f"Banning profile '{profile['name']}' due to high failure rate: {reason}") if not self.dry_run: - self.manager.update_profile_state(profile['name'], self.manager.STATE_BANNED, reason) + sm = self.manager.get_state_machine(profile['name']) + if sm: + sm.ban(reason=reason) self.actions_taken_this_cycle += 1 def enforce_rest_policy(self, profile, rest_after_requests, rest_duration_minutes): @@ -477,9 +653,9 @@ class PolicyEnforcer: logger.info(f"Resting profile '{profile['name']}' for {rest_duration_minutes}m: {reason}") self.actions_taken_this_cycle += 1 if not self.dry_run: - self.manager.update_profile_state(profile['name'], self.manager.STATE_RESTING, reason) - rest_until = time.time() + rest_duration_minutes * 60 - self.manager.update_profile_field(profile['name'], 'rest_until', str(rest_until)) + sm = self.manager.get_state_machine(profile['name']) + if sm: + sm.rest(reason=reason, duration_minutes=rest_duration_minutes) def enforce_stale_lock_cleanup(self, max_lock_seconds): """Finds and unlocks profiles with stale locks.""" @@ -491,12 +667,13 @@ class PolicyEnforcer: if cleaned_count > 0: self.actions_taken_this_cycle += cleaned_count - def enforce_profile_group_policies(self, profile_groups, all_profiles_map): + def enforce_profile_group_policies(self, profile_groups, all_profiles_map, args): """ Manages profiles within defined groups. This includes: 1. Rotating out profiles that have met their request limit. 2. Healing the group by ensuring no more than `max_active_profiles` are active. 3. Initializing the group by activating a profile if none are active. + 4. Healing across all groups to enforce `global_max_active_profiles`. This method operates on and modifies the `all_profiles_map` passed to it. """ @@ -520,8 +697,18 @@ class PolicyEnforcer: if not profiles_in_group: logger.warning(f"Profile group '{group_name}' has no matching profiles. Skipping.") + if not self.dry_run: + # Clean up stale group state from Redis if it exists + self.manager.redis.delete(self.manager._profile_group_state_key(group_name)) continue + # --- Calculate and persist group load for observability --- + total_pending_downloads = sum( + all_profiles_map.get(p_name, {}).get('pending_downloads', 0) + for p_name in profiles_in_group + ) + logger.debug(f"Group '{group_name}': Calculated pending_downloads={total_pending_downloads}. 
Persisting to Redis.") + # --- Persist group policy to Redis for observability --- rotate_after_requests = group.get('rotate_after_requests') max_active_profiles = group.get('max_active_profiles') @@ -531,7 +718,8 @@ class PolicyEnforcer: self.manager.set_profile_group_state(group_name, { 'rotate_after_requests': rotate_after_requests, 'max_active_profiles': max_active_profiles, - 'prefix': group.get('prefix') # Store prefix for observability + 'prefix': group.get('prefix'), # Store prefix for observability + 'pending_downloads': total_pending_downloads }) # --- 1. Handle Rotation for Active Profiles --- @@ -540,7 +728,7 @@ class PolicyEnforcer: # Consider ACTIVE, LOCKED, and COOLDOWN profiles for rotation eligibility. eligible_for_rotation_check = [ p for p in all_profiles_list - if p['name'] in profiles_in_group and p['state'] in [self.manager.STATE_ACTIVE, self.manager.STATE_LOCKED, self.manager.STATE_COOLDOWN] + if p['name'] in profiles_in_group and p['state'] in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value, ProfileState.COOLDOWN.value] ] for profile in eligible_for_rotation_check: @@ -554,7 +742,7 @@ class PolicyEnforcer: if total_requests >= rotate_after_requests: # If a profile is LOCKED, we can't rotate it yet. # Instead, we update its reason to show that a rotation is pending. - if profile['state'] == self.manager.STATE_LOCKED: + if profile['state'] == ProfileState.LOCKED.value: pending_reason = f"Pending Rotation (requests: {total_requests}/{rotate_after_requests})" # Only update if the reason is not already set, to avoid spamming Redis. if profile.get('reason') != pending_reason: @@ -562,6 +750,8 @@ class PolicyEnforcer: self.actions_taken_this_cycle += 1 if not self.dry_run: self.manager.update_profile_field(profile['name'], 'reason', pending_reason) + # Update local map + all_profiles_map[profile['name']]['reason'] = pending_reason else: logger.debug(f"Profile '{profile['name']}' in group '{group_name}' is due for rotation but is currently LOCKED. Already marked as pending.") continue @@ -571,36 +761,41 @@ class PolicyEnforcer: logger.info(f"Rotating profile '{profile['name']}' in group '{group_name}': {reason}") self.actions_taken_this_cycle += 1 - wait_for_downloads = group.get('wait_download_finish_per_profile', False) - + wait_for_downloads_group = group.get('wait_download_finish_per_group', False) + # For backward compatibility + wait_for_downloads_profile = group.get('wait_download_finish_per_profile', False) + new_reason = reason rest_until_ts = 0 + is_waiting_profile = False - if wait_for_downloads: + if wait_for_downloads_group or wait_for_downloads_profile: + is_waiting_profile = True new_reason = "waiting_downloads" - logger.info(f"Profile '{profile['name']}' will wait for pending downloads to complete.") + if wait_for_downloads_group: + logger.info(f"Profile '{profile['name']}' rotated. 
Group '{group_name}' will wait for all downloads before yielding to other groups.") + else: # wait_for_downloads_profile must be true + logger.info(f"Profile '{profile['name']}' will wait for pending downloads to complete.") else: rest_duration_minutes = group.get('rest_duration_minutes_on_rotation') if rest_duration_minutes and rest_duration_minutes > 0: rest_until_ts = time.time() + rest_duration_minutes * 60 if not self.dry_run: - self.manager.update_profile_state(profile['name'], self.manager.STATE_RESTING, new_reason) - - if wait_for_downloads: - self.manager.update_profile_field(profile['name'], 'wait_started_at', str(time.time())) - # Set rest_until to 0 to indicate it's not a time-based rest - self.manager.update_profile_field(profile['name'], 'rest_until', '0') - elif rest_until_ts > 0: - self.manager.update_profile_field(profile['name'], 'rest_until', str(rest_until_ts)) - - # Reset all session counters for the next active cycle - self.manager.reset_profile_counters(profile['name']) + sm = self.manager.get_state_machine(profile['name']) + if sm: + rest_duration_minutes = group.get('rest_duration_minutes_on_rotation') + sm.rest( + reason=new_reason, + duration_minutes=rest_duration_minutes, + is_waiting_profile=is_waiting_profile, + is_rotation=True + ) # Update our local map so subsequent policies in this cycle see the change immediately. - all_profiles_map[profile['name']]['state'] = self.manager.STATE_RESTING + all_profiles_map[profile['name']]['state'] = ProfileState.RESTING.value all_profiles_map[profile['name']]['rest_reason'] = new_reason - if wait_for_downloads: + if is_waiting_profile: all_profiles_map[profile['name']]['wait_started_at'] = time.time() all_profiles_map[profile['name']]['rest_until'] = 0 else: @@ -612,13 +807,21 @@ class PolicyEnforcer: all_profiles_map[profile['name']]['download_error_count'] = 0 # --- 2. Self-Healing: Enforce max_active_profiles --- - max_active = group.get('max_active_profiles', 1) + max_active_config = group.get('max_active_profiles') + + if max_active_config in (0, -1) or max_active_config is None: + max_active = len(profiles_in_group) + if max_active == 0: max_active = 1 # Fallback for empty groups + elif isinstance(max_active_config, int) and max_active_config > 0: + max_active = max_active_config + else: + max_active = 1 # Default to 1 on invalid config or for safety # Get the current list of active/locked profiles from our potentially modified local map # A profile is considered "active" for group limits if it is ACTIVE, LOCKED, or in COOLDOWN. current_active_or_locked_profiles = [ p for name, p in all_profiles_map.items() - if name in profiles_in_group and p['state'] in [self.manager.STATE_ACTIVE, self.manager.STATE_LOCKED, self.manager.STATE_COOLDOWN] + if name in profiles_in_group and p['state'] in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value, ProfileState.COOLDOWN.value] ] num_active_or_locked = len(current_active_or_locked_profiles) @@ -626,13 +829,13 @@ class PolicyEnforcer: logger.warning(f"Healing group '{group_name}': Found {num_active_or_locked} active/locked profiles, but max is {max_active}. Resting excess ACTIVE profiles.") # We can only rest profiles that are in the ACTIVE state, not LOCKED. - profiles_that_can_be_rested = [p for p in current_active_or_locked_profiles if p['state'] == self.manager.STATE_ACTIVE] + profiles_that_can_be_rested = [p for p in current_active_or_locked_profiles if p['state'] == ProfileState.ACTIVE.value] # Sort to determine which profiles to rest. 
We prefer to rest profiles # that have been used more. As a tie-breaker (especially for profiles - # with 0 requests), we rest the one that has been active the longest - # (oldest last_used timestamp). - profiles_that_can_be_rested.sort(key=lambda p: p.get('last_used', 0)) # Oldest first + # with 0 requests), we rest profiles with higher names first to ensure + # deterministic behavior (e.g., user1_2 is rested before user1_1). + profiles_that_can_be_rested.sort(key=lambda p: natural_sort_key(p.get('name', '')), reverse=True) # Higher name first profiles_that_can_be_rested.sort(key=lambda p: ( p.get('success_count', 0) + p.get('failure_count', 0) + p.get('tolerated_error_count', 0) + @@ -651,18 +854,22 @@ class PolicyEnforcer: logger.warning(f"Healing group '{group_name}': Resting profile '{profile['name']}' (request count: {req_count}).") self.actions_taken_this_cycle += 1 if not self.dry_run: - self.manager.update_profile_state(profile['name'], self.manager.STATE_RESTING, "Group max_active healing") - # Give it a rest time of 0, so it's immediately eligible for activation - # by the unrest logic if the group has capacity. - rest_until_ts = 0 - self.manager.update_profile_field(profile['name'], 'rest_until', str(rest_until_ts)) + sm = self.manager.get_state_machine(profile['name']) + if sm: + # Rest for a minimal duration to prevent immediate re-activation in the same cycle. + sm.rest(reason="Group max_active healing", duration_minutes=0.02) # ~1.2 seconds + + # Update local map to reflect the change for this cycle + all_profiles_map[profile['name']]['state'] = ProfileState.RESTING.value + all_profiles_map[profile['name']]['rest_reason'] = "Group max_active healing" + all_profiles_map[profile['name']]['rest_until'] = time.time() + (0.02 * 60) # --- 3. Initialization: Activate profiles if below capacity --- # This is a fallback for initialization or if all profiles were rested/banned. # The primary activation mechanism is in `enforce_unrest_policy`. elif num_active_or_locked < max_active: # Check if there are any non-active, non-banned, non-locked profiles to activate. - eligible_profiles = [p for name, p in all_profiles_map.items() if name in profiles_in_group and p['state'] not in [self.manager.STATE_ACTIVE, self.manager.STATE_BANNED, self.manager.STATE_LOCKED]] + eligible_profiles = [p for name, p in all_profiles_map.items() if name in profiles_in_group and p['state'] not in [ProfileState.ACTIVE.value, ProfileState.BANNED.value, ProfileState.LOCKED.value]] if eligible_profiles: # This is a simple initialization case. We don't activate here because # `enforce_unrest_policy` will handle it more intelligently based on rest times. @@ -670,6 +877,58 @@ class PolicyEnforcer: if num_active_or_locked == 0: logger.debug(f"Group '{group_name}' has no active profiles. `enforce_unrest_policy` will attempt to activate one.") + # --- 4. Global Self-Healing: Enforce global_max_active_profiles --- + # This runs after all per-group healing and ensures the global limit is respected. 
+ global_max_active = getattr(args, 'global_max_active_profiles', 0) + if global_max_active > 0: + # Get all profiles managed by any group + all_grouped_profiles = set() + for group in profile_groups: + profiles_in_group = set() + if 'profiles' in group: + profiles_in_group = set(group['profiles']) + elif 'prefix' in group: + prefix = group['prefix'] + profiles_in_group = {p['name'] for p in all_profiles_list if p['name'].startswith(prefix)} + all_grouped_profiles.update(profiles_in_group) + + # Get current active count across all groups from our local map + current_global_active = [ + p for name, p in all_profiles_map.items() + if name in all_grouped_profiles and p['state'] in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value, ProfileState.COOLDOWN.value] + ] + + num_global_active = len(current_global_active) + if num_global_active > global_max_active: + logger.warning(f"Global Healing: Found {num_global_active} active profiles across all groups, but global max is {global_max_active}. Resting excess.") + + # We can only rest profiles that are in the ACTIVE state, not LOCKED. + profiles_that_can_be_rested = [p for p in current_global_active if p['state'] == ProfileState.ACTIVE.value] + + # Sort to determine which profiles to rest, using the same logic as per-group healing. + profiles_that_can_be_rested.sort(key=lambda p: natural_sort_key(p.get('name', '')), reverse=True) # Higher name first + profiles_that_can_be_rested.sort(key=lambda p: ( + p.get('success_count', 0) + p.get('failure_count', 0) + + p.get('tolerated_error_count', 0) + + p.get('download_count', 0) + p.get('download_error_count', 0) + ), reverse=True) # Most requests first + + num_to_rest = num_global_active - global_max_active + profiles_to_rest = profiles_that_can_be_rested[:num_to_rest] + for profile in profiles_to_rest: + logger.warning(f"Global Healing: Resting profile '{profile['name']}'.") + self.actions_taken_this_cycle += 1 + if not self.dry_run: + sm = self.manager.get_state_machine(profile['name']) + if sm: + # Rest for a minimal duration to prevent immediate re-activation in the same cycle. + sm.rest(reason="Global max_active healing", duration_minutes=0.02) # ~1.2 seconds + + # Update local map to reflect the change for this cycle + all_profiles_map[profile['name']]['state'] = ProfileState.RESTING.value + all_profiles_map[profile['name']]['rest_reason'] = "Global max_active healing" + all_profiles_map[profile['name']]['rest_until'] = time.time() + (0.02 * 60) + def enforce_proxy_group_rotation(self, proxy_groups): """Manages mutually exclusive work cycles for proxies within defined groups.""" if not proxy_groups: @@ -714,11 +973,11 @@ class PolicyEnforcer: if not self.dry_run: # Activate the first, rest the others - self.manager.set_proxy_state(proxies_in_group[0], self.manager.STATE_ACTIVE) + self.manager.set_proxy_state(proxies_in_group[0], ProfileState.ACTIVE.value) for i, proxy_url in enumerate(proxies_in_group): if i != active_proxy_index: # Rest indefinitely; group logic will activate it when its turn comes. 
- self.manager.set_proxy_state(proxy_url, self.manager.STATE_RESTING, rest_duration_minutes=99999) + self.manager.set_proxy_state(proxy_url, ProfileState.RESTING.value, rest_duration_minutes=99999) self.manager.set_proxy_group_state(group_name, active_proxy_index, next_rotation_ts) @@ -737,9 +996,9 @@ class PolicyEnforcer: if not self.dry_run: # Rest the old proxy - self.manager.set_proxy_state(old_active_proxy, self.manager.STATE_RESTING, rest_duration_minutes=99999) + self.manager.set_proxy_state(old_active_proxy, ProfileState.RESTING.value, rest_duration_minutes=99999) # Activate the new one - self.manager.set_proxy_state(new_active_proxy, self.manager.STATE_ACTIVE) + self.manager.set_proxy_state(new_active_proxy, ProfileState.ACTIVE.value) # Update group state self.manager.set_proxy_group_state(group_name, next_active_index, next_rotation_ts) @@ -775,44 +1034,48 @@ class PolicyEnforcer: now = time.time() for proxy_url, state_data in proxy_states.items(): - state = state_data.get('state', self.manager.STATE_ACTIVE) + state = state_data.get('state', ProfileState.ACTIVE.value) # Un-rest logic - if state == self.manager.STATE_RESTING: + if state == ProfileState.RESTING.value: rest_until = state_data.get('rest_until', 0) if now >= rest_until: logger.info(f"Activating proxy '{proxy_url}' (rest period complete).") self.actions_taken_this_cycle += 1 if not self.dry_run: - self.manager.set_proxy_state(proxy_url, self.manager.STATE_ACTIVE) + self.manager.set_proxy_state(proxy_url, ProfileState.ACTIVE.value) # Also activate any profiles that were resting due to this proxy profiles_for_proxy = [p for p in all_profiles if p.get('proxy') == proxy_url] for profile in profiles_for_proxy: - if profile['state'] == self.manager.STATE_RESTING and profile.get('rest_reason') == self.PROXY_REST_REASON: + if profile['state'] == ProfileState.RESTING.value and profile.get('rest_reason') == self.PROXY_REST_REASON: logger.info(f"Activating profile '{profile['name']}' as its proxy '{proxy_url}' is now active.") self.actions_taken_this_cycle += 1 if not self.dry_run: - self.manager.update_profile_state(profile['name'], self.manager.STATE_ACTIVE, "Proxy activated") + sm = self.manager.get_state_machine(profile['name']) + if sm: + sm.activate() else: # Proxy is still resting. Ensure any of its profiles that are ACTIVE are moved to RESTING. # This catches profiles that were unlocked while their proxy was resting. 
                    rest_until_ts = state_data.get('rest_until', 0)
                    profiles_for_proxy = [p for p in all_profiles if p.get('proxy') == proxy_url]
                    for profile in profiles_for_proxy:
-                        if profile['state'] == self.manager.STATE_ACTIVE:
+                        if profile['state'] == ProfileState.ACTIVE.value:
                            logger.info(f"Resting profile '{profile['name']}' as its proxy '{proxy_url}' is resting.")
                            self.actions_taken_this_cycle += 1
                            if not self.dry_run:
-                                self.manager.update_profile_state(profile['name'], self.manager.STATE_RESTING, self.PROXY_REST_REASON)
-                                self.manager.update_profile_field(profile['name'], 'rest_until', str(rest_until_ts))
+                                duration_minutes = (rest_until_ts - time.time()) / 60
+                                sm = self.manager.get_state_machine(profile['name'])
+                                if sm:
+                                    sm.rest(reason=self.PROXY_REST_REASON, duration_minutes=max(0, duration_minutes))

            # Rest logic
-            elif state == self.manager.STATE_ACTIVE:
+            elif state == ProfileState.ACTIVE.value:
                work_start = state_data.get('work_start_timestamp', 0)
                if work_start == 0:
                    # Proxy was just created, start its work cycle
                    if not self.dry_run:
-                        self.manager.set_proxy_state(proxy_url, self.manager.STATE_ACTIVE)
+                        self.manager.set_proxy_state(proxy_url, ProfileState.ACTIVE.value)
                    continue

                work_duration_seconds = work_minutes * 60
@@ -824,17 +1087,19 @@ class PolicyEnforcer:
                    rest_until_ts = time.time() + rest_minutes * 60

                    if not self.dry_run:
-                        self.manager.set_proxy_state(proxy_url, self.manager.STATE_RESTING, rest_minutes)
+                        self.manager.set_proxy_state(proxy_url, ProfileState.RESTING.value, rest_minutes)

                        # Also rest any active profiles using this proxy
                        profiles_for_proxy = [p for p in all_profiles if p.get('proxy') == proxy_url]
                        for profile in profiles_for_proxy:
-                            if profile['state'] == self.manager.STATE_ACTIVE:
+                            if profile['state'] == ProfileState.ACTIVE.value:
                                logger.info(f"Resting profile '{profile['name']}' as its proxy '{proxy_url}' is resting.")
                                self.actions_taken_this_cycle += 1
                                if not self.dry_run:
-                                    self.manager.update_profile_state(profile['name'], self.manager.STATE_RESTING, self.PROXY_REST_REASON)
-                                    self.manager.update_profile_field(profile['name'], 'rest_until', str(rest_until_ts))
+                                    duration_minutes = (rest_until_ts - time.time()) / 60
+                                    sm = self.manager.get_state_machine(profile['name'])
+                                    if sm:
+                                        sm.rest(reason=self.PROXY_REST_REASON, duration_minutes=max(0, duration_minutes))

    def enforce_max_proxy_active_time(self, args):
        """
@@ -859,7 +1124,7 @@ class PolicyEnforcer:
        now = time.time()
        for proxy_url, state_data in proxy_states.items():
-            if state_data.get('state') == self.manager.STATE_ACTIVE:
+            if state_data.get('state') == ProfileState.ACTIVE.value:
                work_start = state_data.get('work_start_timestamp', 0)
                if work_start == 0:
                    continue # Just activated, timestamp not set yet.
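The proxy work/rest cycle enforced by the hunks above reduces to a small decision on the stored proxy state. A minimal sketch of that decision as a pure function, assuming the proxy state fields used in this diff ('state', 'work_start_timestamp', 'rest_until'); the real enforcer applies the result through set_proxy_state() and the profile state machines.

import time

def next_proxy_action(state_data, work_minutes, rest_minutes, now=None):
    """Return (action, rest_until_ts) for a proxy given its current state data."""
    now = now if now is not None else time.time()
    state = state_data.get('state', 'ACTIVE')
    if state == 'RESTING':
        # Wake the proxy (and the profiles rested on its behalf) once the window elapses.
        if now >= state_data.get('rest_until', 0):
            return 'activate', 0
        return 'keep_resting', state_data.get('rest_until', 0)
    # ACTIVE: start the work cycle if it has no timestamp yet, otherwise check it.
    work_start = state_data.get('work_start_timestamp', 0)
    if work_start == 0:
        return 'start_work_cycle', 0
    if now - work_start >= work_minutes * 60:
        return 'rest', now + rest_minutes * 60
    return 'keep_working', 0

# Example: a proxy that started working 45 minutes ago with a 30/10 cycle is rested.
print(next_proxy_action({'state': 'ACTIVE', 'work_start_timestamp': time.time() - 45 * 60}, 30, 10))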
@@ -874,17 +1139,19 @@ class PolicyEnforcer: rest_until_ts = now + rest_minutes * 60 if not self.dry_run: - self.manager.set_proxy_state(proxy_url, self.manager.STATE_RESTING, rest_minutes) + self.manager.set_proxy_state(proxy_url, ProfileState.RESTING.value, rest_minutes) # Also rest any active profiles using this proxy profiles_for_proxy = [p for p in all_profiles if p.get('proxy') == proxy_url] for profile in profiles_for_proxy: - if profile['state'] == self.manager.STATE_ACTIVE: + if profile['state'] == ProfileState.ACTIVE.value: logger.info(f"Resting profile '{profile['name']}' as its proxy '{proxy_url}' is resting due to max active time.") self.actions_taken_this_cycle += 1 if not self.dry_run: - self.manager.update_profile_state(profile['name'], self.manager.STATE_RESTING, self.PROXY_REST_REASON) - self.manager.update_profile_field(profile['name'], 'rest_until', str(rest_until_ts)) + duration_minutes = (rest_until_ts - time.time()) / 60 + sm = self.manager.get_state_machine(profile['name']) + if sm: + sm.rest(reason=self.PROXY_REST_REASON, duration_minutes=max(0, duration_minutes)) def enforce_proxy_policies(self, args): proxy_ban_enabled = args.proxy_ban_on_failures and args.proxy_ban_on_failures > 0 @@ -936,8 +1203,10 @@ class PolicyEnforcer: if not self.dry_run: for profile in profiles_for_proxy: # Don't re-ban already banned profiles - if profile['state'] != self.manager.STATE_BANNED: - self.manager.update_profile_state(profile['name'], self.manager.STATE_BANNED, reason) + if profile['state'] != ProfileState.BANNED.value: + sm = self.manager.get_state_machine(profile['name']) + if sm: + sm.ban(reason=reason) return True # Indicates action was taken return False @@ -958,7 +1227,7 @@ class PolicyEnforcer: self.actions_taken_this_cycle += 1 if not self.dry_run: - self.manager.set_proxy_state(proxy_url, self.manager.STATE_RESTING, rest_duration_minutes) + self.manager.set_proxy_state(proxy_url, ProfileState.RESTING.value, rest_duration_minutes) return True # Indicates action was taken return False @@ -976,6 +1245,7 @@ def add_policy_enforcer_parser(subparsers): parser.add_argument('--redis-host', default=None, help='Redis host. Defaults to REDIS_HOST or MASTER_HOST_IP env var, or localhost.') parser.add_argument('--redis-port', type=int, default=None, help='Redis port. Defaults to REDIS_PORT env var, or 6379.') parser.add_argument('--redis-password', default=None, help='Redis password. Defaults to REDIS_PASSWORD env var.') + parser.add_argument('--redis-db', type=int, default=None, help='Redis DB number. Defaults to REDIS_DB env var, or 0.') parser.add_argument('--env', default=None, help="Default environment name for Redis key prefix. Used if --auth-env or --download-env are not specified. Overrides policy file setting.") parser.add_argument('--auth-env', help="Override the environment for the Auth simulation.") parser.add_argument('--download-env', help="Override the environment for the Download simulation.") @@ -1080,17 +1350,22 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F auth_reason = auth_profile.get('reason', '') # If auth profile is waiting for downloads, we must NOT sync the RESTING state to the download profile, # as that would prevent it from processing the very downloads we are waiting for. 
- if auth_state == auth_manager.STATE_RESTING and auth_reason == 'waiting_downloads': + if auth_state == ProfileState.RESTING.value and auth_reason == 'waiting_downloads': logger.debug(f"Auth profile '{auth_profile['name']}' is waiting for downloads. Skipping state sync to download profile to prevent deadlock.") else: logger.info(f"Syncing download profile '{download_profile_name}' to state '{auth_state}' (auth lead)") if not dry_run: + sm = download_manager.get_state_machine(download_profile_name) + if not sm: continue + reason_to_sync = auth_reason or 'Synced from auth' - download_manager.update_profile_state(download_profile_name, auth_state, f"Synced from auth: {reason_to_sync}") - if auth_state == auth_manager.STATE_RESTING: - auth_rest_until = auth_profile.get('rest_until') - if auth_rest_until: - download_manager.update_profile_field(download_profile_name, 'rest_until', str(auth_rest_until)) + + if auth_state == ProfileState.BANNED.value: + sm.ban(reason=f"Synced from auth: {reason_to_sync}") + elif auth_state == ProfileState.RESTING.value: + auth_rest_until = auth_profile.get('rest_until', 0) + duration_minutes = max(0, (auth_rest_until - time.time()) / 60) + sm.rest(reason=f"Synced from auth: {reason_to_sync}", duration_minutes=duration_minutes) # Handle rotation sync if sync_rotation: @@ -1100,14 +1375,16 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F # as that would prevent it from processing the very downloads we are waiting for. if auth_reason == 'waiting_downloads': logger.debug(f"Auth profile '{auth_profile['name']}' is waiting for downloads. Skipping rotation sync to download profile to prevent deadlock.") - elif auth_state == auth_manager.STATE_RESTING and 'rotate' in auth_reason.lower(): - if download_state != download_manager.STATE_RESTING: + elif auth_state == ProfileState.RESTING.value and 'rotate' in auth_reason.lower(): + if download_state != ProfileState.RESTING.value: logger.info(f"Rotating download profile '{download_profile_name}' due to auth rotation") if not dry_run: - download_manager.update_profile_state(download_profile_name, download_manager.STATE_RESTING, f"Rotated due to auth rotation: {auth_reason}") - auth_rest_until = auth_profile.get('rest_until') - if auth_rest_until: - download_manager.update_profile_field(download_profile_name, 'rest_until', str(auth_rest_until)) + sm = download_manager.get_state_machine(download_profile_name) + if not sm: continue + + auth_rest_until = auth_profile.get('rest_until', 0) + duration_minutes = max(0, (auth_rest_until - time.time()) / 60) + sm.rest(reason=f"Rotated due to auth rotation: {auth_reason}", duration_minutes=duration_minutes, is_rotation=True) # --- Active Profile Sync --- sync_active = sync_config.get('sync_active_profile', False) @@ -1123,14 +1400,14 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F # 1. Add profiles that are active in auth simulation (if sync_active is enabled) if sync_active: - active_auth_profiles = [p for p in all_auth_profiles.values() if p['state'] in [auth_manager.STATE_ACTIVE, auth_manager.STATE_LOCKED]] + active_auth_profiles = [p for p in all_auth_profiles.values() if p['state'] in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]] for auth_profile in active_auth_profiles: target_active_download_profiles.add(auth_profile['name']) # 2. 
Add profiles that are waiting for downloads to complete (if sync_waiting_downloads is enabled) if sync_waiting_downloads: waiting_auth_profiles = [p for p in all_auth_profiles.values() - if p['state'] == auth_manager.STATE_RESTING + if p['state'] == ProfileState.RESTING.value and p.get('rest_reason') == 'waiting_downloads'] for auth_profile in waiting_auth_profiles: target_active_download_profiles.add(auth_profile['name']) @@ -1138,7 +1415,9 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F if not target_active_download_profiles: logger.debug("No auth profiles found that need active download profiles.") - return + # If no auth profiles need download profiles active, we should rest ALL download profiles + # to prevent them from being used when they shouldn't be. + # This is handled by the deactivation logic below (target_download_groups will be empty). # Get download profile group info from Redis dl_group_state_keys = [k for k in download_manager.redis.scan_iter(f"{download_manager.key_prefix}profile_group_state:*")] @@ -1160,25 +1439,41 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F logger.warning(f"Auth profile '{target_profile_name}' needs an active download profile, but no corresponding download profile found.") continue - if download_profile['state'] not in [download_manager.STATE_ACTIVE, download_manager.STATE_LOCKED]: - is_from_cooldown = download_profile['state'] == download_manager.STATE_COOLDOWN + if download_profile['state'] not in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]: + is_from_cooldown = download_profile['state'] == ProfileState.COOLDOWN.value log_msg_suffix = " (from COOLDOWN)" if is_from_cooldown else "" logger.info(f"Syncing active state: Activating download profile '{target_profile_name}' to match auth requirements{log_msg_suffix}.") if not dry_run: - download_manager.update_profile_state(target_profile_name, download_manager.STATE_ACTIVE, "Synced from auth requirements") - # Only reset counters if it's coming from a long-term rest, not a short cooldown. - if not is_from_cooldown: - download_manager.reset_profile_counters(target_profile_name) + sm = download_manager.get_state_machine(target_profile_name) + if sm: + # Pass profile so action can decide whether to reset counters + sm.activate(profile=download_profile) - # Deactivate any download profiles that are active but shouldn't be + # --- Group-Aware Deactivation --- + # Identify the target download groups based on target_active_download_profiles. + # CRITICAL FIX: Directly map individual profiles to their groups instead of relying on name patterns. 
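The group-aware deactivation described above (and implemented in the next hunk) is essentially a set mapping: resolve each auth-required profile to its download group, then rest every ACTIVE download profile whose group is not needed this cycle. A minimal sketch with illustrative names:

def plan_group_aware_deactivation(target_profiles, profile_to_group, download_profiles):
    """Return (target_groups, profiles_to_rest) for one sync cycle."""
    target_groups = {
        profile_to_group[name]['name']
        for name in target_profiles if name in profile_to_group
    }
    profiles_to_rest = [
        name for name, p in download_profiles.items()
        if p.get('state') == 'ACTIVE'
        and name in profile_to_group
        and profile_to_group[name]['name'] not in target_groups
    ]
    return target_groups, profiles_to_rest

# Example: user2_1 is rested because no auth profile needs group 'g2' this cycle.
groups = {'user1_1': {'name': 'g1'}, 'user2_1': {'name': 'g2'}}
dl = {'user1_1': {'state': 'ACTIVE'}, 'user2_1': {'state': 'ACTIVE'}}
print(plan_group_aware_deactivation({'user1_1'}, groups, dl))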
+ target_download_groups = set() + + for target_profile_name in target_active_download_profiles: + group_info = dl_profile_to_group.get(target_profile_name) + if group_info: + target_download_groups.add(group_info['name']) + + logger.debug(f"Target download groups for this sync cycle: {target_download_groups}") + + # Deactivate any download profiles that are active but are not in a target group for dl_profile_name, dl_profile in all_download_profiles.items(): - if dl_profile['state'] == download_manager.STATE_ACTIVE and dl_profile_name not in target_active_download_profiles: + if dl_profile['state'] == ProfileState.ACTIVE.value: group_info = dl_profile_to_group.get(dl_profile_name) - if group_info: - logger.info(f"Syncing active state: Resting download profile '{dl_profile_name}' as it is no longer the active profile in its group.") + # If the profile is in a group, and that group is NOT a target group, rest it. + if group_info and group_info['name'] not in target_download_groups: + logger.info(f"Syncing active state: Resting download profile '{dl_profile_name}' as its group '{group_info['name']}' is no longer active.") if not dry_run: - download_manager.update_profile_state(dl_profile_name, download_manager.STATE_RESTING, "Synced rotation from auth") - download_manager.update_profile_field(dl_profile_name, 'rest_until', '0') + sm = download_manager.get_state_machine(dl_profile_name) + if sm: + # Use a very long duration to prevent immediate re-activation by enforce_unrest_policy. + # The sync logic will wake it up when needed. + sm.rest(reason="Synced rotation from auth", duration_minutes=999999) def main_policy_enforcer(args): """Main dispatcher for 'policy-enforcer' command.""" @@ -1218,27 +1513,32 @@ def main_policy_enforcer(args): 'unlock_stale_locks_after_seconds': 120, 'unlock_cooldown_seconds': 0, 'max_global_proxy_active_minutes': 0, 'rest_duration_on_max_active': 10, - 'profile_selection_strategy': 'round_robin', + 'profile_selection_strategy': 'longest_idle', + 'global_max_active_profiles': 0, 'interval_seconds': 60, 'proxy_groups': [], 'profile_groups': [] } sim_params = policy.get('simulation_parameters', {}) - env_file_from_policy = sim_params.get('env_file') - + if load_dotenv: + env_file_from_policy = sim_params.get('env_file') env_file = args.env_file or env_file_from_policy + if not env_file and args.env and '.env' in args.env and os.path.exists(args.env): print(f"WARNING: --env should be an environment name, not a file path. 
Treating '{args.env}' as --env-file.", file=sys.stderr) env_file = args.env - if env_file and load_dotenv(env_file): - print(f"Loaded environment variables from {env_file}", file=sys.stderr) - elif args.env_file and not os.path.exists(args.env_file): + + was_loaded = load_dotenv(env_file) + if was_loaded: + print(f"Loaded environment variables from {env_file or '.env file'}", file=sys.stderr) + elif args.env_file: # This only runs if was_loaded is False AND args.env_file was provided print(f"ERROR: The specified --env-file was not found: {args.env_file}", file=sys.stderr) return 1 redis_host = args.redis_host or os.getenv('REDIS_HOST', os.getenv('MASTER_HOST_IP', 'localhost')) redis_port = args.redis_port if args.redis_port is not None else int(os.getenv('REDIS_PORT', 6379)) redis_password = args.redis_password or os.getenv('REDIS_PASSWORD') + redis_db = args.redis_db if args.redis_db is not None else int(os.getenv('REDIS_DB', 0)) if args.verbose: logging.getLogger().setLevel(logging.DEBUG) @@ -1277,7 +1577,7 @@ def main_policy_enforcer(args): elif args.legacy: key_prefix = 'profile_mgmt_' else: key_prefix = f"{effective_env}_profile_mgmt_" - manager = ProfileManager(redis_host, redis_port, redis_password, key_prefix) + manager = ProfileManager(redis_host, redis_port, redis_password, key_prefix, redis_db) enforcer = PolicyEnforcer(manager, dry_run=args.dry_run) # Write any relevant config to Redis for workers to use @@ -1293,6 +1593,11 @@ def main_policy_enforcer(args): proxy_rest_duration_minutes = getattr(config, 'proxy_rest_duration_minutes', None) if proxy_rest_duration_minutes is not None and not args.dry_run: manager.set_config('proxy_rest_duration_minutes', proxy_rest_duration_minutes) + + # Write profile_selection_strategy to Redis for observability + strategy = getattr(config, 'profile_selection_strategy', None) + if strategy and not args.dry_run: + manager.set_config('profile_selection_strategy', strategy) return {'name': sim_type, 'enforcer': enforcer, 'config': config} diff --git a/ytops_client-source/ytops_client/profile_manager_tool.py b/ytops_client-source/ytops_client/profile_manager_tool.py index 2ef11b2..2b9eea0 100644 --- a/ytops_client-source/ytops_client/profile_manager_tool.py +++ b/ytops_client-source/ytops_client/profile_manager_tool.py @@ -10,6 +10,7 @@ import io import logging import os import random +import re import signal import sys import threading @@ -20,6 +21,8 @@ import collections import redis +from .profile_statemachine import ProfileState, ProfileStateMachine + try: from dotenv import load_dotenv except ImportError: @@ -48,32 +51,53 @@ def handle_shutdown(sig, frame): shutdown_event.set() +def natural_sort_key(s: str) -> List[Any]: + """Key for natural sorting (e.g., 'user10' comes after 'user2').""" + return [int(text) if text.isdigit() else text.lower() for text in re.split('([0-9]+)', s)] + + +def format_timestamp(ts: float) -> str: + """Format timestamp for display.""" + if not ts or ts == 0: + return "Never" + return datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') + + +def format_duration(seconds: float) -> str: + """Format duration for display.""" + if seconds < 60: + return f"{seconds:.0f}s" + elif seconds < 3600: + return f"{seconds/60:.1f}m" + elif seconds < 86400: + return f"{seconds/3600:.1f}h" + else: + return f"{seconds/86400:.1f}d" + + class ProfileManager: """Manages profiles in Redis with configurable prefix.""" - # Profile states - STATE_ACTIVE = "ACTIVE" - STATE_PAUSED = "PAUSED" - STATE_RESTING = "RESTING" - STATE_BANNED = 
"BANNED" - STATE_LOCKED = "LOCKED" - STATE_COOLDOWN = "COOLDOWN" - - VALID_STATES = [STATE_ACTIVE, STATE_PAUSED, STATE_RESTING, STATE_BANNED, STATE_LOCKED, STATE_COOLDOWN] + # Profile states are defined in the ProfileState enum. + VALID_STATES = ProfileState.values() def __init__(self, redis_host='localhost', redis_port=6379, - redis_password=None, key_prefix='profile_mgmt_'): + redis_password=None, key_prefix='profile_mgmt_', redis_db=0): """Initialize Redis connection and key prefix.""" self.key_prefix = key_prefix - logger.info(f"Attempting to connect to Redis at {redis_host}:{redis_port}...") + logger.info(f"Attempting to connect to Redis at {redis_host}:{redis_port} (DB: {redis_db})...") try: self.redis = redis.Redis( host=redis_host, port=redis_port, password=redis_password, + db=redis_db, decode_responses=True, - socket_connect_timeout=5, - socket_timeout=5 + socket_connect_timeout=10, + socket_timeout=30, + socket_keepalive=True, + retry_on_timeout=True, + max_connections=10 ) self.redis.ping() logger.info(f"Successfully connected to Redis.") @@ -121,16 +145,65 @@ class ProfileManager: """Get Redis key for a profile's pending downloads counter.""" return f"{self.key_prefix}downloads_pending:{profile_name}" + def _activation_log_key(self) -> str: + """Get Redis key for the activation event log.""" + return f"{self.key_prefix}log:activations" + + def log_activation_event(self, event_data: Dict[str, Any]): + """Logs a profile activation event to a capped list in Redis.""" + key = self._activation_log_key() + try: + # Serialize the event data to a JSON string + event_json = json.dumps(event_data, default=str) + + pipe = self.redis.pipeline() + # Prepend the new event to the list + pipe.lpush(key, event_json) + # Trim the list to keep only the most recent 20 entries + pipe.ltrim(key, 0, 19) + pipe.execute() + logger.debug(f"Logged activation event: {event_json}") + except (TypeError, redis.RedisError) as e: + logger.error(f"Failed to log activation event: {e}") + + def get_activation_events(self, count: int = 10) -> List[Dict[str, Any]]: + """Retrieves the most recent activation events from Redis.""" + key = self._activation_log_key() + try: + events_json = self.redis.lrange(key, 0, count - 1) + events = [json.loads(e) for e in events_json] + return events + except (TypeError, json.JSONDecodeError, redis.RedisError) as e: + logger.error(f"Failed to retrieve or parse activation events: {e}") + return [] + def increment_pending_downloads(self, profile_name: str, count: int = 1) -> Optional[int]: - """Atomically increments the pending downloads counter for a profile.""" - if count <= 0: + """Atomically increments (or decrements if count is negative) the pending downloads counter for a profile.""" + if count == 0: return None + key = self._pending_downloads_key(profile_name) + + # When decrementing, ensure the counter exists to avoid creating negative counters from stray calls. + if count < 0 and not self.redis.exists(key): + logger.warning(f"Attempted to decrement pending downloads for '{profile_name}' by {abs(count)}, but no counter exists. No action taken.") + return None + new_value = self.redis.incrby(key, count) - # Set a TTL on the key to prevent it from living forever if something goes wrong. - # 5 hours is a safe buffer for the 4-hour info.json validity. - self.redis.expire(key, 5 * 3600) - logger.info(f"Incremented pending downloads for '{profile_name}' by {count}. 
New count: {new_value}") + + if count > 0: + # Set/refresh TTL on positive increments to prevent it from living forever. + # 5 hours is a safe buffer for the 4-hour info.json validity. + self.redis.expire(key, 5 * 3600) + logger.info(f"Incremented pending downloads for '{profile_name}' by {count}. New count: {new_value}") + elif count < 0: + logger.info(f"Decremented pending downloads for '{profile_name}' by {abs(count)}. New count: {new_value}") + + if new_value <= 0: + # Clean up the key if it drops to or below zero. + self.redis.delete(key) + logger.info(f"Pending downloads for '{profile_name}' reached zero or less. Cleared counter key.") + return new_value def decrement_pending_downloads(self, profile_name: str) -> Optional[int]: @@ -187,8 +260,11 @@ class ProfileManager: """Get Redis key for the failed lock attempts counter.""" return f"{self.key_prefix}stats:failed_lock_attempts" - def create_profile(self, name: str, proxy: str, initial_state: str = STATE_ACTIVE) -> bool: + def create_profile(self, name: str, proxy: str, initial_state: str = ProfileState.ACTIVE.value) -> bool: """Create a new profile.""" + # Normalize to uppercase + initial_state = initial_state.upper() + if initial_state not in self.VALID_STATES: logger.error(f"Invalid initial state: {initial_state}") return False @@ -339,87 +415,51 @@ class ProfileManager: profiles.append(data) - # Sort by creation time (newest first) - profiles.sort(key=lambda x: x.get('created_at', 0), reverse=True) + # Sort by natural name order + profiles.sort(key=lambda p: natural_sort_key(p.get('name', ''))) return profiles def update_profile_state(self, name: str, new_state: str, reason: str = '') -> bool: - """Update profile state.""" - if new_state not in self.VALID_STATES: + """Update profile state by triggering a state machine transition.""" + # Normalize to uppercase + new_state = new_state.upper() + + if new_state not in ProfileState.values(): logger.error(f"Invalid state: {new_state}") return False - profile = self.get_profile(name) - if not profile: - logger.error(f"Profile '{name}' not found") - return False + sm = self.get_state_machine(name) + if not sm: + return False # get_state_machine logs the error - old_state = profile['state'] - if old_state == new_state: - logger.info(f"Profile '{name}' already in state {new_state}") + if sm.current_state.value == new_state: + logger.info(f"Profile '{name}' already in state {new_state}. No action taken.") return True - - now = time.time() - profile_key = self._profile_key(name) - - pipe = self.redis.pipeline() - - # Update profile hash - updates = {'state': new_state, 'last_used': str(now)} - - if new_state == self.STATE_BANNED and reason: - updates['ban_reason'] = reason - elif new_state == self.STATE_RESTING: - # Set rest_until to 1 hour from now by default - rest_until = now + 3600 - updates['rest_until'] = str(rest_until) - if reason: - updates['rest_reason'] = reason - - # Handle transitions into ACTIVE state - if new_state == self.STATE_ACTIVE: - # Clear any resting/banned state fields - updates['rest_until'] = '0' - updates['rest_reason'] = '' - updates['reason'] = '' - updates['ban_reason'] = '' # Clear ban reason on manual activation - if old_state in [self.STATE_RESTING, self.STATE_COOLDOWN]: - updates['last_rest_timestamp'] = str(now) - - # When activating a profile, ensure its proxy is also active. 
- proxy_url = profile.get('proxy') - if proxy_url: - logger.info(f"Activating associated proxy '{proxy_url}' for profile '{name}'.") - pipe.hset(self._proxy_state_key(proxy_url), mapping={ - 'state': self.STATE_ACTIVE, - 'rest_until': '0', - 'work_start_timestamp': str(now) - }) - - # If moving to any state that is not LOCKED, ensure any stale lock data is cleared. - # This makes manual state changes (like 'activate' or 'unban') more robust. - if new_state != self.STATE_LOCKED: - updates['lock_owner'] = '' - updates['lock_timestamp'] = '0' - pipe.hdel(self._locks_key(), name) - if old_state == self.STATE_LOCKED: - logger.info(f"Profile '{name}' was in LOCKED state. Clearing global lock.") - pipe.hset(profile_key, mapping=updates) - - # Remove from old state index, add to new state index - if old_state in self.VALID_STATES: - pipe.zrem(self._state_key(old_state), name) - pipe.zadd(self._state_key(new_state), {name: now}) - - result = pipe.execute() - - logger.info(f"Updated profile '{name}' from {old_state} to {new_state}") - if reason: - logger.info(f"Reason: {reason}") - - return True + try: + if new_state == ProfileState.ACTIVE.value: + sm.activate() + elif new_state == ProfileState.BANNED.value: + sm.ban(reason=reason) + elif new_state == ProfileState.RESTING.value: + sm.rest(reason=reason) + elif new_state == ProfileState.PAUSED.value: + sm.pause(reason=reason) + # LOCKED and COOLDOWN are not handled here as they are special transitions + # from lock_profile and unlock_profile, and should not be set directly. + elif new_state in [ProfileState.LOCKED.value, ProfileState.COOLDOWN.value]: + logger.error(f"Manual state transition to '{new_state}' is not allowed. Use lock_profile() or unlock_profile().") + return False + else: + # This case should not be reached if ProfileState.values() is correct + logger.error(f"State transition to '{new_state}' is not implemented in update_profile_state.") + return False + + return True + except Exception as e: + logger.error(f"Failed to update profile '{name}' from {sm.current_state.id} to '{new_state}': {e}", exc_info=True) + return False def update_profile_field(self, name: str, field: str, value: str) -> bool: """Update a specific field in profile.""" @@ -449,7 +489,7 @@ class ProfileManager: pipe.delete(profile_key) # Remove from state index - if state in self.VALID_STATES: + if state in ProfileState.values(): pipe.zrem(self._state_key(state), name) # Delete activity keys @@ -581,7 +621,7 @@ class ProfileManager: """Get the total count of failed lock attempts from Redis.""" count = self.redis.get(self._failed_lock_attempts_key()) return int(count) if count else 0 - + def get_global_stats(self) -> Dict[str, int]: """Get aggregated global stats across all profiles.""" profiles = self.list_profiles() @@ -629,7 +669,7 @@ class ProfileManager: def set_proxy_state(self, proxy_url: str, state: str, rest_duration_minutes: Optional[int] = None) -> bool: """Set the state of a proxy and propagates it to associated profiles.""" - if state not in [self.STATE_ACTIVE, self.STATE_RESTING]: + if state not in [ProfileState.ACTIVE.value, ProfileState.RESTING.value]: logger.error(f"Invalid proxy state: {state}. 
Only ACTIVE and RESTING are supported for proxies.") return False @@ -638,7 +678,7 @@ class ProfileManager: updates = {'state': state} rest_until = 0 - if state == self.STATE_RESTING: + if state == ProfileState.RESTING.value: if not rest_duration_minutes or rest_duration_minutes <= 0: logger.error("rest_duration_minutes is required when setting proxy state to RESTING.") return False @@ -657,17 +697,17 @@ class ProfileManager: if not profiles_on_proxy: return True - if state == self.STATE_RESTING: + if state == ProfileState.RESTING.value: logger.info(f"Propagating RESTING state to profiles on proxy '{proxy_url}'.") for profile in profiles_on_proxy: - if profile['state'] == self.STATE_ACTIVE: - self.update_profile_state(profile['name'], self.STATE_RESTING, "Proxy resting") + if profile['state'] == ProfileState.ACTIVE.value: + self.update_profile_state(profile['name'], ProfileState.RESTING.value, "Proxy resting") self.update_profile_field(profile['name'], 'rest_until', str(rest_until)) - elif state == self.STATE_ACTIVE: + elif state == ProfileState.ACTIVE.value: logger.info(f"Propagating ACTIVE state to profiles on proxy '{proxy_url}'.") for profile in profiles_on_proxy: - if profile['state'] == self.STATE_RESTING and profile.get('rest_reason') == "Proxy resting": - self.update_profile_state(profile['name'], self.STATE_ACTIVE, "Proxy activated") + if profile['state'] == ProfileState.RESTING.value and profile.get('rest_reason') == "Proxy resting": + self.update_profile_state(profile['name'], ProfileState.ACTIVE.value, "Proxy activated") return True @@ -700,7 +740,7 @@ class ProfileManager: states[proxy_url] = data else: # Default to ACTIVE if no state is found - states[proxy_url] = {'state': self.STATE_ACTIVE, 'rest_until': 0.0, 'work_start_timestamp': 0.0} + states[proxy_url] = {'state': ProfileState.ACTIVE.value, 'rest_until': 0.0, 'work_start_timestamp': 0.0} return states @@ -787,13 +827,21 @@ class ProfileManager: 'active_profile_index': int, 'rotate_after_requests': int, 'max_active_profiles': int, + 'pending_downloads': int, } + float_fields = ['last_finished_downloads_ts'] for field, type_converter in numeric_fields.items(): if field in data: try: data[field] = type_converter(data[field]) except (ValueError, TypeError): data[field] = 0 + for field in float_fields: + if field in data: + try: + data[field] = float(data[field]) + except (ValueError, TypeError): + data[field] = 0.0 states[group_name] = data else: states[group_name] = {} @@ -812,7 +860,7 @@ class ProfileManager: profiles_to_check = [specific_profile_name] else: # Original logic: find all active profiles, optionally filtered by prefix. - active_profiles = self.redis.zrange(self._state_key(self.STATE_ACTIVE), 0, -1) + active_profiles = self.redis.zrange(self._state_key(ProfileState.ACTIVE.value), 0, -1) if not active_profiles: logger.warning("No active profiles available to lock.") self.redis.incr(self._failed_lock_attempts_key()) @@ -831,7 +879,7 @@ class ProfileManager: full_profiles = [self.get_profile(p) for p in profiles_to_check] # Filter out any None profiles from a race condition with deletion, and ensure state is ACTIVE. # This is especially important when locking a specific profile. 
- full_profiles = [p for p in full_profiles if p and p.get('proxy') and p.get('state') == self.STATE_ACTIVE] + full_profiles = [p for p in full_profiles if p and p.get('proxy') and p.get('state') == ProfileState.ACTIVE.value] if not full_profiles: if specific_profile_name: @@ -846,7 +894,7 @@ class ProfileManager: eligible_profiles = [ p['name'] for p in full_profiles - if proxy_states.get(p['proxy'], {}).get('state', self.STATE_ACTIVE) == self.STATE_ACTIVE + if proxy_states.get(p['proxy'], {}).get('state', ProfileState.ACTIVE.value) == ProfileState.ACTIVE.value ] if not eligible_profiles: @@ -863,33 +911,40 @@ class ProfileManager: # Try to acquire lock atomically if self.redis.hsetnx(locks_key, name, owner): # Lock acquired. Now, re-check state to avoid race condition with enforcer. - profile_key = self._profile_key(name) - current_state = self.redis.hget(profile_key, 'state') + profile = self.get_profile(name) + if not profile: # Profile might have been deleted in a race condition. + self.redis.hdel(locks_key, name) + continue - if current_state != self.STATE_ACTIVE: + current_state = profile.get('state') + + # Normalize to uppercase for check + if not current_state or current_state.upper() != ProfileState.ACTIVE.value: # Another process (enforcer) changed the state. Release lock and try next. self.redis.hdel(locks_key, name) logger.warning(f"Aborted lock for '{name}'; state changed from ACTIVE to '{current_state}' during lock acquisition.") continue # State is still ACTIVE, proceed with locking. - now = time.time() + sm = self.get_state_machine(name, profile=profile) + if not sm: + # Should not happen if we just checked the profile + self.redis.hdel(locks_key, name) + continue - pipe = self.redis.pipeline() - # Update profile state and lock info - pipe.hset(profile_key, mapping={ - 'state': self.STATE_LOCKED, - 'lock_owner': owner, - 'lock_timestamp': str(now), - 'last_used': str(now) - }) - # Move from ACTIVE to LOCKED state index - pipe.zrem(self._state_key(self.STATE_ACTIVE), name) - pipe.zadd(self._state_key(self.STATE_LOCKED), {name: now}) - pipe.execute() - - logger.info(f"Locked profile '{name}' for owner '{owner}'") - return self.get_profile(name) + try: + # The hsetnx above acquired the global lock. Now we transition the state. + sm.lock(owner=owner) + # The on_enter_locked action handles all Redis updates for the profile itself. + # The logger messages are also in the action. + return self.get_profile(name) + except Exception as e: + # This could be a TransitionNotAllowed error if the state changed, + # or a Redis error during the action. + logger.error(f"Failed to transition profile '{name}' to LOCKED state: {e}", exc_info=True) + # Release the global lock as the state transition failed. 
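The locking flow above follows a claim/verify/rollback pattern: HSETNX claims the global lock, the profile state is re-read to catch a concurrent change by the enforcer, and the claim is released if the verification or the state-machine transition fails. A standalone sketch of that pattern, assuming redis-py with decode_responses=True and the key layout implied by the diff; do_transition stands in for ProfileStateMachine.lock().

import redis

def try_lock(r: redis.Redis, locks_key: str, profile_key: str, name: str,
             owner: str, do_transition) -> bool:
    # 1. Claim: HSETNX succeeds only if nobody else holds the lock for this profile.
    if not r.hsetnx(locks_key, name, owner):
        return False
    # 2. Verify: the enforcer may have changed the state while we were claiming.
    state = r.hget(profile_key, 'state')
    if not state or state.upper() != 'ACTIVE':
        r.hdel(locks_key, name)          # roll back the claim
        return False
    # 3. Transition: any failure here must also release the claim.
    try:
        do_transition(owner)             # e.g. sm.lock(owner=owner)
        return True
    except Exception:
        r.hdel(locks_key, name)
        return False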
+ self.redis.hdel(locks_key, name) + continue logger.warning("Could not lock any active profile (all may have been locked by other workers).") self.redis.incr(self._failed_lock_attempts_key()) @@ -902,8 +957,10 @@ class ProfileManager: logger.error(f"Profile '{name}' not found.") return False - if profile['state'] != self.STATE_LOCKED: - logger.warning(f"Profile '{name}' is not in LOCKED state (current: {profile['state']}).") + # Normalize to uppercase for check + current_state = profile.get('state') + if not current_state or current_state.upper() != ProfileState.LOCKED.value: + logger.warning(f"Profile '{name}' is not in LOCKED state (current: {current_state}).") # Forcibly remove from locks hash if it's inconsistent self.redis.hdel(self._locks_key(), name) return False @@ -912,45 +969,93 @@ class ProfileManager: logger.error(f"Owner mismatch: cannot unlock profile '{name}'. Locked by '{profile['lock_owner']}', attempted by '{owner}'.") return False - now = time.time() - profile_key = self._profile_key(name) + sm = self.get_state_machine(name, profile=profile) + if not sm: + return False - pipe = self.redis.pipeline() + try: + if rest_for_seconds and rest_for_seconds > 0: + sm.start_cooldown(duration=rest_for_seconds) + else: + sm.unlock() + return True + except Exception as e: + logger.error(f"Failed to unlock profile '{name}': {e}", exc_info=True) + return False - updates = { - 'lock_owner': '', - 'lock_timestamp': '0', - 'last_used': str(now) - } + def get_state_machine(self, name: str, profile: Optional[Dict[str, Any]] = None) -> Optional[ProfileStateMachine]: + """ + Initializes and returns a ProfileStateMachine instance for a given profile, + set to its current state from Redis. + If `profile` object is not provided, it will be fetched from Redis. + """ + if profile is None: + profile = self.get_profile(name) + + if not profile: + logger.error(f"Cannot create state machine for non-existent profile '{name}'") + return None - if rest_for_seconds and rest_for_seconds > 0: - new_state = self.STATE_COOLDOWN - rest_until = now + rest_for_seconds - updates['rest_until'] = str(rest_until) - updates['rest_reason'] = 'Post-task cooldown' - logger_msg = f"Unlocked profile '{name}' into COOLDOWN for {rest_for_seconds}s." - else: - new_state = self.STATE_ACTIVE - # Clear any rest-related fields when moving to ACTIVE - updates['rest_until'] = '0' - updates['rest_reason'] = '' - updates['reason'] = '' - logger_msg = f"Unlocked profile '{name}'" + current_state_str = profile.get('state') + if not current_state_str: + logger.error(f"Profile '{name}' has no state. Cannot initialize state machine.") + return None - updates['state'] = new_state - pipe.hset(profile_key, mapping=updates) + # Normalize to uppercase to handle potential inconsistencies (e.g. "locked" vs "LOCKED") + current_state_str = current_state_str.upper() - # Move from LOCKED to the new state index - pipe.zrem(self._state_key(self.STATE_LOCKED), name) - pipe.zadd(self._state_key(new_state), {name: now}) + if current_state_str not in self.VALID_STATES: + logger.error(f"Profile '{name}' has an invalid state value '{current_state_str}' in Redis. Cannot initialize state machine.") + return None + + # The `model` parameter in the StateMachine constructor is where we can pass + # context. We pass the manager and profile name. + # We convert the Redis state (uppercase value) to the state machine identifier (lowercase attribute name). 
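Hydration, as described above and implemented in the workaround that follows, reduces to normalizing the value stored in Redis and matching it against the machine's state objects without re-running on_enter actions. A library-free sketch; the State class here is a stand-in for the state objects exposed by the real ProfileStateMachine.

from dataclasses import dataclass
from typing import Iterable, Optional

@dataclass(frozen=True)
class State:
    value: str  # e.g. "ACTIVE", matching the ProfileState values

def resolve_stored_state(stored: Optional[str], states: Iterable[State]) -> Optional[State]:
    """Return the state object matching a stored (possibly lower-case) value, else None."""
    if not stored:
        return None
    normalized = stored.upper()
    return next((s for s in states if s.value.upper() == normalized), None)

# Example: a profile stored as "resting" is hydrated onto the RESTING state object.
machine_states = [State("ACTIVE"), State("RESTING"), State("LOCKED")]
print(resolve_stored_state("resting", machine_states))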
- # Remove from global locks hash - pipe.hdel(self._locks_key(), name) + # When re-hydrating a state machine from a stored state, we don't want to re-trigger + # the `on_enter` actions for the current state. We can suppress the initial transition + # and set the state directly. - pipe.execute() + # WORKAROUND for older statemachine library: + # Instantiating the machine triggers an initial transition to ACTIVE, which wrongly updates Redis. + # We let this happen, and then immediately correct the state if it was supposed to be something else. + sm = ProfileStateMachine(manager=self, profile_name=name) - logger.info(logger_msg) - return True + # The sm is now in ACTIVE state, and Redis has been updated. If the original state was + # LOCKED, we must re-lock it to fix Redis and the state machine object so transitions work. + if current_state_str == ProfileState.LOCKED.value: + lock_owner = profile.get('lock_owner', 're-lock-owner') + try: + # This transition ensures the `on_enter_LOCKED` actions are run, making the + # state consistent in Redis and in the state machine object. + sm.lock(owner=lock_owner) + except Exception as e: + logger.error(f"Failed to re-lock profile '{name}' during state machine hydration: {e}") + # The state is now inconsistent, best to not return a broken machine. + return None + elif current_state_str != sm.current_state.value.upper(): + # For any other state, we must manually fix both the state machine object and Redis, + # as the constructor wrongly transitioned to ACTIVE. + + # 1. Force state on the machine object. This does not trigger actions. + target_state_obj = next((s for s in sm.states if s.value.upper() == current_state_str), None) + if not target_state_obj: + logger.error(f"Could not find state object for '{current_state_str}' during hydration of '{name}'.") + return None + sm.current_state = target_state_obj + + # 2. Manually revert the state in Redis to what it should be. + profile_key = self._profile_key(name) + pipe = self.redis.pipeline() + pipe.hset(profile_key, 'state', current_state_str) + # Atomically move the profile from the incorrect ACTIVE index to the correct one. + # The constructor may have added it to ACTIVE without removing it from its original state index. + pipe.zrem(self._state_key(ProfileState.ACTIVE.value), name) + pipe.zadd(self._state_key(current_state_str), {name: profile.get('last_used', time.time())}) + pipe.execute() + logger.debug(f"Corrected state for '{name}' to '{current_state_str}' in object and Redis during hydration.") + + return sm def cleanup_stale_locks(self, max_lock_time_seconds: int) -> int: """Find and unlock profiles with stale locks.""" @@ -972,6 +1077,21 @@ class ProfileManager: cleaned_count += 1 continue + # --- NEW: Check for inconsistent locks on ACTIVE profiles --- + # A lock should not exist for a profile that is still ACTIVE, except for the + # milliseconds between when a worker acquires the lock and when it transitions + # the profile state to LOCKED. If the policy enforcer sees this state, it's + # almost certainly a stale lock from a crashed worker. + if profile.get('state') == ProfileState.ACTIVE.value: + logger.warning( + f"Found inconsistent lock for ACTIVE profile '{name}' (owner: '{owner}'). " + "This indicates a worker may have crashed. Cleaning up stale lock." 
+ ) + self.redis.hdel(locks_key, name) + cleaned_count += 1 + continue + # --- END NEW LOGIC --- + lock_timestamp = profile.get('lock_timestamp', 0) if lock_timestamp > 0 and (now - lock_timestamp) > max_lock_time_seconds: logger.warning(f"Found stale lock for profile '{name}' (locked by '{owner}' for {now - lock_timestamp:.0f}s). Unlocking...") @@ -984,23 +1104,6 @@ class ProfileManager: logger.debug("No stale locks found to clean up.") return cleaned_count -def format_timestamp(ts: float) -> str: - """Format timestamp for display.""" - if not ts or ts == 0: - return "Never" - return datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') - -def format_duration(seconds: float) -> str: - """Format duration for display.""" - if seconds < 60: - return f"{seconds:.0f}s" - elif seconds < 3600: - return f"{seconds/60:.1f}m" - elif seconds < 86400: - return f"{seconds/3600:.1f}h" - else: - return f"{seconds/86400:.1f}d" - def add_profile_manager_parser(subparsers): """Adds the parser for the 'profile' command.""" @@ -1016,6 +1119,7 @@ def add_profile_manager_parser(subparsers): common_parser.add_argument('--env-file', help='Path to a .env file to load environment variables from.') common_parser.add_argument('--redis-host', default=None, help='Redis host. Defaults to REDIS_HOST or MASTER_HOST_IP env var, or localhost.') common_parser.add_argument('--redis-port', type=int, default=None, help='Redis port. Defaults to REDIS_PORT env var, or 6379.') + common_parser.add_argument('--redis-db', type=int, default=None, help='Redis DB number. Defaults to REDIS_DB env var, or 0.') common_parser.add_argument('--redis-password', default=None, help='Redis password. Defaults to REDIS_PASSWORD env var.') common_parser.add_argument('--env', default='dev', help="Environment name for Redis key prefix (e.g., 'stg', 'prod'). Used by all non-list commands, and by 'list' for single-view mode. Defaults to 'dev'.") common_parser.add_argument('--legacy', action='store_true', help="Use legacy key prefix ('profile_mgmt_') without environment.") @@ -1028,9 +1132,9 @@ def add_profile_manager_parser(subparsers): create_parser = subparsers.add_parser('create', help='Create a new profile', parents=[common_parser]) create_parser.add_argument('name', help='Profile name') create_parser.add_argument('proxy', help='Proxy URL (e.g., sslocal-rust-1090:1090)') - create_parser.add_argument('--state', default='ACTIVE', - choices=['ACTIVE', 'PAUSED', 'RESTING', 'BANNED', 'COOLDOWN'], - help='Initial state (default: ACTIVE)') + create_parser.add_argument('--state', default=ProfileState.ACTIVE.value, + choices=ProfileState.values(), + help=f'Initial state (default: {ProfileState.ACTIVE.value})') # List command list_parser = subparsers.add_parser('list', help='List profiles', parents=[common_parser]) @@ -1041,6 +1145,8 @@ def add_profile_manager_parser(subparsers): list_parser.add_argument('--state', help='Filter by state') list_parser.add_argument('--proxy', help='Filter by proxy (substring match)') list_parser.add_argument('--show-proxy-activity', action='store_true', help='Show a detailed activity summary table for proxies. If --proxy is specified, shows details for that proxy only. 
Otherwise, shows a summary for all proxies.') + list_parser.add_argument('--show-reasons', action='store_true', help='Show detailed reasons for group and profile selection states.') + list_parser.add_argument('--show-activation-history', action='store_true', help='Show the recent profile activation history.') list_parser.add_argument('--format', choices=['table', 'json', 'csv'], default='table', help='Output format (default: table)') list_parser.add_argument('--live', action='store_true', help='Run continuously with a non-blinking live-updating display.') @@ -1061,7 +1167,7 @@ def add_profile_manager_parser(subparsers): # Update state command update_state_parser = subparsers.add_parser('update-state', help='Update profile state', parents=[common_parser]) update_state_parser.add_argument('name', help='Profile name') - update_state_parser.add_argument('state', choices=['ACTIVE', 'PAUSED', 'RESTING', 'BANNED', 'LOCKED', 'COOLDOWN'], + update_state_parser.add_argument('state', choices=ProfileState.values(), help='New state') update_state_parser.add_argument('--reason', help='Reason for state change (especially for BAN)') @@ -1072,20 +1178,20 @@ def add_profile_manager_parser(subparsers): update_field_parser.add_argument('value', help='New value') # Pause command (convenience) - pause_parser = subparsers.add_parser('pause', help='Pause a profile (sets state to PAUSED).', parents=[common_parser]) + pause_parser = subparsers.add_parser('pause', help=f'Pause a profile (sets state to {ProfileState.PAUSED.value}).', parents=[common_parser]) pause_parser.add_argument('name', help='Profile name') # Activate command (convenience) - activate_parser = subparsers.add_parser('activate', help='Activate a profile (sets state to ACTIVE). Useful for resuming a PAUSED profile or fixing a stale LOCKED one.', parents=[common_parser]) + activate_parser = subparsers.add_parser('activate', help=f'Activate a profile (sets state to {ProfileState.ACTIVE.value}). 
Useful for resuming a PAUSED profile or fixing a stale LOCKED one.', parents=[common_parser]) activate_parser.add_argument('name', help='Profile name') # Ban command (convenience) - ban_parser = subparsers.add_parser('ban', help='Ban a profile (sets state to BANNED).', parents=[common_parser]) + ban_parser = subparsers.add_parser('ban', help=f'Ban a profile (sets state to {ProfileState.BANNED.value}).', parents=[common_parser]) ban_parser.add_argument('name', help='Profile name') ban_parser.add_argument('--reason', required=True, help='Reason for ban') # Unban command (convenience) - unban_parser = subparsers.add_parser('unban', help='Unban a profile (sets state to ACTIVE and resets session counters).', parents=[common_parser]) + unban_parser = subparsers.add_parser('unban', help=f'Unban a profile (sets state to {ProfileState.ACTIVE.value} and resets session counters).', parents=[common_parser]) unban_parser.add_argument('name', help='Profile name') # Delete command @@ -1329,7 +1435,7 @@ def _render_proxy_activity_summary(manager, proxy_url, simulation_type, file=sys print(tabulate(table_data, headers=headers, tablefmt='grid'), file=file) -def _render_profile_group_summary_table(manager, all_profiles, profile_groups_config, file=sys.stdout): +def _render_profile_group_summary_table(manager, all_profiles, profile_groups_config, args, file=sys.stdout): """Renders a summary table for profile groups.""" if not profile_groups_config: return @@ -1337,6 +1443,69 @@ def _render_profile_group_summary_table(manager, all_profiles, profile_groups_co print("\nProfile Group Status:", file=file) table_data = [] all_profiles_map = {p['name']: p for p in all_profiles} + + # --- New logic to determine the next group to be activated --- + profile_selection_strategy = manager.get_config('profile_selection_strategy') + next_up_group_name = None + next_up_reason = "" + + if profile_selection_strategy and profile_groups_config: + # This logic mirrors the enforcer's selection process for display purposes. + # It determines which group is likely to have its profile activated next. 
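The 'longest_idle' preview computed below orders ready profiles the same way the enforcer activates them: never-used profiles first in natural name order, then used profiles by oldest last_used. A minimal sketch using the field names from this diff and illustrative profiles:

import re

def natural_sort_key(s):
    return [int(t) if t.isdigit() else t.lower() for t in re.split('([0-9]+)', s)]

def total_requests(p):
    return (p.get('success_count', 0) + p.get('failure_count', 0) +
            p.get('tolerated_error_count', 0) +
            p.get('download_count', 0) + p.get('download_error_count', 0))

def longest_idle_order(ready_profiles):
    """Order ready (rested, cooled-down) profiles by activation priority."""
    unused = [p for p in ready_profiles if total_requests(p) == 0]
    used = [p for p in ready_profiles if total_requests(p) > 0]
    unused.sort(key=lambda p: natural_sort_key(p.get('name', '')))
    used.sort(key=lambda p: (p.get('last_used', 0), natural_sort_key(p.get('name', ''))))
    return unused + used

ready = [
    {'name': 'user1_2', 'success_count': 3, 'last_used': 1000.0},
    {'name': 'user1_1', 'success_count': 2, 'last_used': 2000.0},
    {'name': 'user2_1'},  # never used: shown as next up first
]
print([p['name'] for p in longest_idle_order(ready)])  # user2_1, user1_2, user1_1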
+ now = time.time() + all_profiles_by_name = {p['name']: p for p in all_profiles} + + if profile_selection_strategy == 'least_loaded': + sorted_groups = sorted( + profile_groups_config, + key=lambda g: ( + g.get('pending_downloads', 0), + g.get('last_finished_downloads_ts', float('inf')), + g.get('name', '') + ) + ) + if sorted_groups: + next_up_group = sorted_groups[0] + next_up_group_name = next_up_group['name'] + next_up_reason = profile_selection_strategy + if getattr(args, 'show_reasons', False): + load = next_up_group.get('pending_downloads', 0) + finish_ts = next_up_group.get('last_finished_downloads_ts', 0) + finish_str = f"finished {format_duration(time.time() - finish_ts)} ago" if finish_ts > 0 else "never finished" + next_up_reason = f"least_loaded (load: {load}, {finish_str})" + + elif profile_selection_strategy == 'longest_idle': + # Find the single longest idle profile across all groups + ready_profiles = [] + for group in profile_groups_config: + for p_name in group.get('profiles_in_group', []): + p = all_profiles_by_name.get(p_name) + if p and p['state'] in [ProfileState.RESTING.value, ProfileState.COOLDOWN.value] and p.get('rest_until', 0) <= now and p.get('rest_reason') != 'waiting_downloads': + ready_profiles.append(p) + + if ready_profiles: + # Sort them according to the 'longest_idle' activation logic + unused_profiles = [p for p in ready_profiles if (p.get('success_count', 0) + p.get('failure_count', 0) + p.get('tolerated_error_count', 0) + p.get('download_count', 0) + p.get('download_error_count', 0)) == 0] + used_profiles = [p for p in ready_profiles if p not in unused_profiles] + + unused_profiles.sort(key=lambda p: natural_sort_key(p.get('name', ''))) + used_profiles.sort(key=lambda p: (p.get('last_used', 0), natural_sort_key(p.get('name', '')))) + + sorted_ready_profiles = unused_profiles + used_profiles + + if sorted_ready_profiles: + next_profile = sorted_ready_profiles[0] + # Find which group it belongs to + for group in profile_groups_config: + if next_profile['name'] in group.get('profiles_in_group', []): + next_up_group_name = group['name'] + next_up_reason = profile_selection_strategy + if getattr(args, 'show_reasons', False): + last_used_ts = next_profile.get('last_used', 0) + idle_time_str = f"idle for {format_duration(time.time() - last_used_ts)}" if last_used_ts > 0 else "never used" + next_up_reason = f"longest_idle (via {next_profile['name']}, {idle_time_str})" + break + # --- End new logic --- for group in profile_groups_config: group_name = group.get('name', 'N/A') @@ -1344,7 +1513,7 @@ def _render_profile_group_summary_table(manager, all_profiles, profile_groups_co active_profiles = [ p_name for p_name in profiles_in_group - if all_profiles_map.get(p_name, {}).get('state') in [manager.STATE_ACTIVE, manager.STATE_LOCKED] + if all_profiles_map.get(p_name, {}).get('state') in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value] ] active_profiles_str = ', '.join(active_profiles) or "None" @@ -1369,15 +1538,68 @@ def _render_profile_group_summary_table(manager, all_profiles, profile_groups_co remaining_reqs = rotate_after - total_reqs reqs_left_str = str(max(0, int(remaining_reqs))) + # Recalculate pending downloads on the fly for display accuracy + pending_downloads = sum( + all_profiles_map.get(p_name, {}).get('pending_downloads', 0) + for p_name in profiles_in_group + ) + + selection_priority_str = "" + if group_name == next_up_group_name: + selection_priority_str = f"<- Next Up ({next_up_reason})" + + time_since_finish_str = "N/A" + 
last_finish_ts = group.get('last_finished_downloads_ts', 0) + if last_finish_ts > 0: + time_since_finish_str = format_duration(time.time() - last_finish_ts) + table_data.append([ group_name, active_profiles_str, policy_str, rotation_rule_str, - reqs_left_str + reqs_left_str, + pending_downloads, + time_since_finish_str, + selection_priority_str ]) - headers = ['Group Name', 'Active Profile(s)', 'Policy', 'Rotation Rule', 'Requests Left ↓'] + headers = ['Group Name', 'Active Profile(s)', 'Policy', 'Rotation Rule', 'Requests Left ↓', 'Pending DLs', 'Time Since Finish', 'Selection Priority'] + print(tabulate(table_data, headers=headers, tablefmt='grid'), file=file) + + +def _render_activation_history_table(manager, file=sys.stdout): + """Renders a table of the most recent profile activation events.""" + if not manager: + return + + # Fetch more events to ensure we have enough after filtering. + # The log is capped at 20 entries in Redis. + events = manager.get_activation_events(count=20) + + # Filter out non-activation events and take the most recent 10. + filtered_events = [ + e for e in events if e.get('reason') != 'Rest/Cooldown completed' + ][:10] + + if not filtered_events: + # Don't print the header if there's nothing to show. + return + + print("\nRecent Profile Activations:", file=file) + table_data = [] + for event in filtered_events: + ts = event.get('ts', 0) + time_str = format_timestamp(ts) if ts > 0 else "N/A" + + table_data.append([ + time_str, + event.get('profile', 'N/A'), + event.get('group', 'N/A'), + event.get('reason', 'N/A') + ]) + + headers = ['Time', 'Profile', 'Group', 'Reason'] print(tabulate(table_data, headers=headers, tablefmt='grid'), file=file) @@ -1428,7 +1650,7 @@ def _render_profile_details_table(manager, args, simulation_type, profile_groups elif args.rest_after_requests and args.rest_after_requests > 0: rotate_after = args.rest_after_requests - if rotate_after > 0 and state_str != manager.STATE_COOLDOWN: + if rotate_after > 0 and state_str != ProfileState.COOLDOWN.value: total_reqs = ( p.get('success_count', 0) + p.get('failure_count', 0) + p.get('tolerated_error_count', 0) + @@ -1470,7 +1692,8 @@ def _render_profile_details_table(manager, args, simulation_type, profile_groups countdown_str, rest_until_str, reason_str, - p.get('ban_reason') or '' + p.get('ban_reason') or '', + p.get('pending_downloads', 0) ]) table_data.append(row) @@ -1481,7 +1704,7 @@ def _render_profile_details_table(manager, args, simulation_type, profile_groups else: # is_download_sim or unknown headers.extend(['DataOK', 'DownFail', 'Skip.Err', 'Tot.DataOK', 'Tot.DownFail']) - headers.extend(['ReqCD ↓', 'RestCD ↓', 'R.Reason', 'B.Reason']) + headers.extend(['ReqCD ↓', 'RestCD ↓', 'R.Reason', 'B.Reason', 'Pend.DLs']) # Using `maxcolwidths` to control column width for backward compatibility # with older versions of the `tabulate` library. This prevents content @@ -1518,7 +1741,6 @@ def _render_simulation_view(title, manager, args, file=sys.stdout): print("'tabulate' library is required for table format. 
Please install it.", file=sys.stderr) return 1 - print(f"\n--- {title} ---", file=file) profiles = manager.list_profiles(args.state, args.proxy) if args.format == 'json': @@ -1533,6 +1755,7 @@ def _render_simulation_view(title, manager, args, file=sys.stdout): return 0 # --- Table Format with Summaries --- + print(f"\n--- {title} ---", file=file) if args.show_proxy_activity: if args.proxy: @@ -1541,7 +1764,12 @@ def _render_simulation_view(title, manager, args, file=sys.stdout): _render_all_proxies_activity_summary(manager, title, file=file) profile_groups_config = _build_profile_groups_config(manager, profiles) - _render_profile_group_summary_table(manager, profiles, profile_groups_config, file=file) + + profile_selection_strategy = manager.get_config('profile_selection_strategy') + if profile_selection_strategy: + print(f"Profile Selection Strategy: {profile_selection_strategy}", file=file) + + _render_profile_group_summary_table(manager, profiles, profile_groups_config, args, file=file) failed_lock_attempts = manager.get_failed_lock_attempts() global_stats = manager.get_global_stats() @@ -1605,6 +1833,9 @@ def _render_simulation_view(title, manager, args, file=sys.stdout): print("\nProfile Details:", file=file) _render_profile_details_table(manager, args, title, profile_groups_config, file=file) + + if args.show_activation_history: + _render_activation_history_table(manager, file=file) return 0 @@ -1689,12 +1920,19 @@ def _render_merged_view(auth_manager, download_manager, args, file=sys.stdout): print(tabulate(proxy_table_data, headers=proxy_headers, tablefmt='grid'), file=file) print(f"\n--- Auth Simulation Profile Details ({args.auth_env}) ---", file=file) - _render_profile_group_summary_table(auth_manager, auth_profiles, auth_groups_config, file=file) + profile_selection_strategy = auth_manager.get_config('profile_selection_strategy') + if profile_selection_strategy: + print(f"Profile Selection Strategy: {profile_selection_strategy}", file=file) + _render_profile_group_summary_table(auth_manager, auth_profiles, auth_groups_config, args, file=file) _render_profile_details_table(auth_manager, args, "Auth", auth_groups_config, file=file) + if args.show_activation_history: + _render_activation_history_table(auth_manager, file=file) print(f"\n--- Download Simulation Profile Details ({args.download_env}) ---", file=file) - _render_profile_group_summary_table(download_manager, dl_profiles, dl_groups_config, file=file) + _render_profile_group_summary_table(download_manager, dl_profiles, dl_groups_config, args, file=file) _render_profile_details_table(download_manager, args, "Download", dl_groups_config, file=file) + if args.show_activation_history: + _render_activation_history_table(download_manager, file=file) return 0 @@ -1709,7 +1947,7 @@ def main_profile_manager(args): if load_dotenv: env_file = args.env_file if not env_file and args.env and '.env' in args.env and os.path.exists(args.env): - print(f"WARNING: --env should be an environment name (e.g., 'dev'), not a file path. Treating '{args.env}' as --env-file. The environment name will default to 'dev'.", file=sys.stderr) + print(f"Warning: --env should be an environment name (e.g., 'dev'), not a file path. Treating '{args.env}' as --env-file. 
The environment name will default to 'dev'.", file=sys.stderr) env_file = args.env args.env = 'dev' @@ -1724,6 +1962,8 @@ def main_profile_manager(args): args.redis_host = os.getenv('REDIS_HOST', os.getenv('MASTER_HOST_IP', 'localhost')) if args.redis_port is None: args.redis_port = int(os.getenv('REDIS_PORT', 6379)) + if getattr(args, 'redis_db', None) is None: + args.redis_db = int(os.getenv('REDIS_DB', 0)) if args.redis_password is None: args.redis_password = os.getenv('REDIS_PASSWORD') @@ -1741,7 +1981,8 @@ def main_profile_manager(args): redis_host=args.redis_host, redis_port=args.redis_port, redis_password=args.redis_password, - key_prefix=key_prefix + key_prefix=key_prefix, + redis_db=args.redis_db ) if args.profile_command == 'create': @@ -1770,7 +2011,8 @@ def main_profile_manager(args): return ProfileManager( redis_host=args.redis_host, redis_port=args.redis_port, - redis_password=args.redis_password, key_prefix=key_prefix + redis_password=args.redis_password, key_prefix=key_prefix, + redis_db=args.redis_db ) if not args.live: @@ -1924,20 +2166,20 @@ def main_profile_manager(args): return 0 if success else 1 elif args.profile_command == 'pause': - success = manager.update_profile_state(args.name, manager.STATE_PAUSED, 'Manual pause') + success = manager.update_profile_state(args.name, ProfileState.PAUSED.value, 'Manual pause') return 0 if success else 1 elif args.profile_command == 'activate': - success = manager.update_profile_state(args.name, manager.STATE_ACTIVE, 'Manual activation') + success = manager.update_profile_state(args.name, ProfileState.ACTIVE.value, 'Manual activation') return 0 if success else 1 elif args.profile_command == 'ban': - success = manager.update_profile_state(args.name, manager.STATE_BANNED, args.reason) + success = manager.update_profile_state(args.name, ProfileState.BANNED.value, args.reason) return 0 if success else 1 elif args.profile_command == 'unban': # First activate, then reset session counters. The ban reason is cleared by update_profile_state. - success = manager.update_profile_state(args.name, manager.STATE_ACTIVE, 'Manual unban') + success = manager.update_profile_state(args.name, ProfileState.ACTIVE.value, 'Manual unban') if success: manager.reset_profile_counters(args.name) return 0 if success else 1 diff --git a/ytops_client-source/ytops_client/profile_statemachine.py b/ytops_client-source/ytops_client/profile_statemachine.py new file mode 100644 index 0000000..e3cf7d6 --- /dev/null +++ b/ytops_client-source/ytops_client/profile_statemachine.py @@ -0,0 +1,330 @@ +from __future__ import annotations +from enum import Enum +from typing import TYPE_CHECKING, Optional +import time +import logging + +from statemachine import StateMachine, State, event + +if TYPE_CHECKING: + from .profile_manager_tool import ProfileManager + + +logger = logging.getLogger(__name__) + + +class ProfileState(Enum): + """Enumeration for profile states.""" + ACTIVE = "ACTIVE" + LOCKED = "LOCKED" + RESTING = "RESTING" + BANNED = "BANNED" + COOLDOWN = "COOLDOWN" + PAUSED = "PAUSED" + + @classmethod + def values(cls): + return [item.value for item in cls] + + +class ProfileStateMachine(StateMachine): + """A state machine for managing the lifecycle of a profile.""" + + # States + # We use lowercase attribute names to match the on_enter_* callback convention (e.g., on_enter_active). + # The value passed to State() is the string stored in Redis (uppercase). 
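+ # For example, ProfileState.ACTIVE is exposed here as `self.active`, persisted in
+ # Redis as the string "ACTIVE", and entering it invokes the on_enter_active() callback.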
+ active = State("ACTIVE", initial=True) + locked = State("LOCKED") + resting = State("RESTING") + banned = State("BANNED") + cooldown = State("COOLDOWN") + paused = State("PAUSED") + + # Transitions + lock = active.to(locked) + unlock = locked.to(active) + start_cooldown = locked.to(cooldown) + + rest = active.to(resting) | locked.to(resting) | cooldown.to(resting) + ban = active.to(banned) | locked.to(banned) | resting.to(banned) | cooldown.to(banned) | paused.to(banned) + activate = resting.to(active) | banned.to(active) | cooldown.to(active) | paused.to(active) + + pause = active.to(paused) | locked.to(paused) | resting.to(paused) | cooldown.to(paused) + + def __init__(self, manager: ProfileManager, profile_name: str, *args, **kwargs): + self.manager = manager + self.profile_name = profile_name + super().__init__(*args, **kwargs) + + # --- Action Methods --- + + def on_enter_locked(self, event_data: event.EventData, owner: Optional[str] = None): + """Action executed when entering the LOCKED state.""" + # When re-hydrating a state machine, `owner` will be None. In this case, + # the profile is already locked, so we should not perform any actions. + if owner is None: + return + + now = time.time() + profile_key = self.manager._profile_key(self.profile_name) + pipe = self.manager.redis.pipeline() + + # Explicitly use Enum value to ensure Uppercase in Redis + state_val = ProfileState.LOCKED.value + + pipe.hset(profile_key, mapping={ + 'state': state_val, + 'lock_owner': owner, + 'lock_timestamp': str(now), + 'last_used': str(now) + }) + + # Update state indexes + if event_data.source: + pipe.zrem(self.manager._state_key(event_data.source.value), self.profile_name) + pipe.zadd(self.manager._state_key(state_val), {self.profile_name: now}) + + pipe.execute() + + logger.info(f"Updated profile '{self.profile_name}' from {event_data.source.value if event_data.source else 'None'} to {state_val}") + logger.info(f"Locked profile '{self.profile_name}' for owner '{owner}'") + + def on_enter_cooldown(self, event_data: event.EventData, duration: Optional[int] = None): + """Action executed when entering the COOLDOWN state.""" + # When re-hydrating a state machine, `duration` will be None. In this case, + # the profile is already in cooldown, so we should not perform any actions. 
+ if duration is None: + return + + now = time.time() + rest_until = now + duration + profile_key = self.manager._profile_key(self.profile_name) + pipe = self.manager.redis.pipeline() + + # Explicitly use Enum value to ensure Uppercase in Redis + state_val = ProfileState.COOLDOWN.value + + pipe.hset(profile_key, mapping={ + 'state': state_val, + 'rest_until': str(rest_until), + 'rest_reason': 'Post-task cooldown', + 'lock_owner': '', + 'lock_timestamp': '0', + 'last_used': str(now) + }) + + # Update state indexes + if event_data.source: + pipe.zrem(self.manager._state_key(event_data.source.value), self.profile_name) + pipe.zadd(self.manager._state_key(state_val), {self.profile_name: now}) + + # Remove from locks + pipe.hdel(self.manager._locks_key(), self.profile_name) + + pipe.execute() + + logger.info(f"Updated profile '{self.profile_name}' from {event_data.source.value if event_data.source else 'None'} to {state_val}") + logger.info(f"Unlocked profile '{self.profile_name}' into COOLDOWN for {duration}s.") + + def on_enter_active(self, event_data: event.EventData, profile: Optional[dict] = None): + """Action executed when entering the ACTIVE state.""" + now = time.time() + source_state = event_data.source + profile_key = self.manager._profile_key(self.profile_name) + pipe = self.manager.redis.pipeline() + + # Explicitly use Enum value to ensure Uppercase in Redis + state_val = ProfileState.ACTIVE.value + + updates = { + 'state': state_val, + 'rest_until': '0', + 'rest_reason': '', + 'reason': '', + 'ban_reason': '', + 'lock_owner': '', + 'lock_timestamp': '0', + 'last_used': str(now), + 'wait_started_at': '0' + } + + if source_state and source_state.value in [ProfileState.RESTING.value, ProfileState.COOLDOWN.value]: + updates['last_rest_timestamp'] = str(now) + + # Reset counters if activating from a long-term off state, but not from a short cooldown. + should_reset_counters = False + if source_state and source_state.value in [ProfileState.BANNED.value, ProfileState.PAUSED.value]: + should_reset_counters = True + elif source_state and source_state.value == ProfileState.RESTING.value: + # For RESTING, only reset if it wasn't just waiting for a slot after a cooldown. + profile_data = profile or self.manager.redis.hgetall(profile_key) + is_waiting_after_cooldown = profile_data.get('rest_reason') == "Waiting for group capacity" + if not is_waiting_after_cooldown: + should_reset_counters = True + + if should_reset_counters: + logger.info(f"Resetting session counters for profile '{self.profile_name}' on activation.") + updates.update({ + 'success_count': '0', + 'failure_count': '0', + 'tolerated_error_count': '0', + 'download_count': '0', + 'download_error_count': '0', + }) + + pipe.hset(profile_key, mapping=updates) + + # Update state indexes - this is critical for list_profiles to work correctly + if source_state: + pipe.zrem(self.manager._state_key(source_state.value), self.profile_name) + pipe.zadd(self.manager._state_key(state_val), {self.profile_name: now}) + + # Remove from locks if coming from LOCKED state + if source_state and source_state.value == ProfileState.LOCKED.value: + pipe.hdel(self.manager._locks_key(), self.profile_name) + + # When activating a profile, ensure its proxy is also active. 
+ proxy_url = self.manager.redis.hget(profile_key, 'proxy') + if proxy_url: + logger.debug(f"Ensuring associated proxy '{proxy_url}' is active for profile '{self.profile_name}'.") + pipe.hset(self.manager._proxy_state_key(proxy_url), mapping={ + 'state': ProfileState.ACTIVE.value, + 'rest_until': '0', + 'work_start_timestamp': str(now) + }) + + pipe.execute() + logger.info(f"Updated profile '{self.profile_name}' from {source_state.value if source_state else 'None'} to {state_val}") + + def on_enter_banned(self, event_data: event.EventData, reason: str = ''): + """Action executed when entering the BANNED state.""" + now = time.time() + profile_key = self.manager._profile_key(self.profile_name) + pipe = self.manager.redis.pipeline() + + # Explicitly use Enum value to ensure Uppercase in Redis + state_val = ProfileState.BANNED.value + + updates = { + 'state': state_val, + 'last_used': str(now), + 'lock_owner': '', + 'lock_timestamp': '0', + 'rest_until': '0', + 'rest_reason': '', + 'wait_started_at': '0' + } + if reason: + updates['ban_reason'] = reason + updates['reason'] = reason + + pipe.hset(profile_key, mapping=updates) + + # Update state indexes + if event_data.source: + pipe.zrem(self.manager._state_key(event_data.source.value), self.profile_name) + pipe.zadd(self.manager._state_key(state_val), {self.profile_name: now}) + + # Remove from locks + if event_data.source and event_data.source.value == ProfileState.LOCKED.value: + pipe.hdel(self.manager._locks_key(), self.profile_name) + + pipe.execute() + logger.info(f"Updated profile '{self.profile_name}' from {event_data.source.value if event_data.source else 'None'} to {state_val}") + if reason: + logger.info(f"Reason for ban: {reason}") + + def on_enter_resting(self, event_data: event.EventData, reason: str = '', duration_minutes: Optional[int] = None, is_waiting_profile: bool = False, is_rotation: bool = False): + """Action executed when entering the RESTING state.""" + now = time.time() + source_state = event_data.source + profile_key = self.manager._profile_key(self.profile_name) + pipe = self.manager.redis.pipeline() + + # Explicitly use Enum value to ensure Uppercase in Redis + state_val = ProfileState.RESTING.value + + updates = { + 'state': state_val, + 'last_used': str(now), + 'lock_owner': '', + 'lock_timestamp': '0', + } + + if is_waiting_profile: + updates['wait_started_at'] = str(now) + updates['rest_until'] = '0' + elif duration_minutes == 0: + updates['rest_until'] = '0' + else: + # Default to 1 hour if no duration is provided + rest_duration_seconds = (duration_minutes * 60) if duration_minutes is not None else 3600 + rest_until = now + rest_duration_seconds + updates['rest_until'] = str(rest_until) + + if reason: + updates['rest_reason'] = reason + updates['reason'] = reason + + if is_rotation: + logger.info(f"Resetting session counters for profile '{self.profile_name}' on rotation.") + updates.update({ + 'success_count': '0', + 'failure_count': '0', + 'tolerated_error_count': '0', + 'download_count': '0', + 'download_error_count': '0', + }) + + pipe.hset(profile_key, mapping=updates) + + # Update state indexes - this is critical for the enforcer to see the correct state + if source_state: + pipe.zrem(self.manager._state_key(source_state.value), self.profile_name) + pipe.zadd(self.manager._state_key(state_val), {self.profile_name: now}) + + # Remove from locks if coming from LOCKED state + if source_state and source_state.value == ProfileState.LOCKED.value: + pipe.hdel(self.manager._locks_key(), self.profile_name) + + 
pipe.execute() + logger.info(f"Updated profile '{self.profile_name}' from {source_state.value if source_state else 'None'} to {state_val}") + if reason: + logger.info(f"Reason for rest: {reason}") + + def on_enter_paused(self, event_data: event.EventData, reason: str = ''): + """Action executed when entering the PAUSED state.""" + now = time.time() + profile_key = self.manager._profile_key(self.profile_name) + pipe = self.manager.redis.pipeline() + + # Explicitly use Enum value to ensure Uppercase in Redis + state_val = ProfileState.PAUSED.value + + updates = { + 'state': state_val, + 'last_used': str(now), + 'lock_owner': '', + 'lock_timestamp': '0', + 'rest_until': '0', + 'rest_reason': '', + 'wait_started_at': '0' + } + if reason: + updates['reason'] = reason + + pipe.hset(profile_key, mapping=updates) + + # Update state indexes + if event_data.source: + pipe.zrem(self.manager._state_key(event_data.source.value), self.profile_name) + pipe.zadd(self.manager._state_key(state_val), {self.profile_name: now}) + + # Remove from locks + if event_data.source and event_data.source.value == ProfileState.LOCKED.value: + pipe.hdel(self.manager._locks_key(), self.profile_name) + + pipe.execute() + logger.info(f"Updated profile '{self.profile_name}' from {event_data.source.value if event_data.source else 'None'} to {state_val}") + if reason: + logger.info(f"Reason for pause: {reason}") diff --git a/ytops_client-source/ytops_client/requirements.txt b/ytops_client-source/ytops_client/requirements.txt index fdeedc4..9132043 100644 --- a/ytops_client-source/ytops_client/requirements.txt +++ b/ytops_client-source/ytops_client/requirements.txt @@ -27,7 +27,7 @@ requests==2.32.5 tabulate # For yt-dlp integration in 'download py', 'list-formats', etc. -yt-dlp +# yt-dlp # --- Pinned yt-dlp dependencies --- # These are pinned to match versions known to work with the server. @@ -40,3 +40,6 @@ pycryptodomex==3.23.0 secretstorage==3.4.0 urllib3==2.5.0 websockets==15.0.1 + +python-statemachine +pytest diff --git a/ytops_client-source/ytops_client/stress_policy/direct_batch_worker.py b/ytops_client-source/ytops_client/stress_policy/direct_batch_worker.py new file mode 100644 index 0000000..82a97e8 --- /dev/null +++ b/ytops_client-source/ytops_client/stress_policy/direct_batch_worker.py @@ -0,0 +1,406 @@ +import collections +import json +import logging +import os +import random +import re +import shlex +import sys +import tempfile +import shutil +import subprocess +import threading +import time +from copy import deepcopy +from datetime import datetime, timezone +from pathlib import Path + +from . import utils as sp_utils +from .process_runners import run_command, run_docker_container, get_worker_id +from ..profile_manager_tool import ProfileManager + + +logger = logging.getLogger(__name__) + + +def run_direct_batch_worker(worker_id, policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock, profile_prefix=None): + """A worker for the 'direct_batch_cli' orchestration mode.""" + owner_id = f"direct-batch-worker-{worker_id}" + settings = policy.get('settings', {}) + exec_control = policy.get('execution_control', {}) + gen_policy = policy.get('info_json_generation_policy', {}) + direct_policy = policy.get('direct_batch_cli_policy', {}) + queue_policy = policy.get('queue_policy') + + # Prioritize the passed-in profile_prefix for worker pool compatibility. 
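+ # If it's not passed (e.g. legacy 'workers' mode), fall back to the policy value.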
+ if not profile_prefix: + profile_prefix = gen_policy.get('profile_prefix') + + if not profile_prefix: + logger.error(f"[Worker {worker_id}] Direct batch mode requires a 'profile_prefix' from the worker pool or 'info_json_generation_policy'. Worker exiting.") + return [] + + batch_size = direct_policy.get('batch_size') + if not batch_size: + logger.error(f"[Worker {worker_id}] Direct batch mode requires 'direct_batch_cli_policy.batch_size'. Worker exiting.") + return [] + + save_dir = settings.get('save_info_json_dir') + if not save_dir: + logger.error(f"[Worker {worker_id}] Direct batch mode requires 'settings.save_info_json_dir'. Worker exiting.") + return [] + + os.makedirs(save_dir, exist_ok=True) + + last_used_profile_name = None + while not state_manager.shutdown_event.is_set(): + locked_profile = None + temp_batch_file = None + # --- Variables for robust finalization --- + files_created = 0 + url_batch_len = 0 + batch_started = False + downloads_per_url = 0 # Default to 0, meaning no increment unless configured + # --- + try: + # 1. Lock a profile + locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) + + # --- New logic to avoid immediate reuse --- + avoid_reuse = direct_policy.get('avoid_immediate_profile_reuse', False) + if avoid_reuse and locked_profile and last_used_profile_name and locked_profile['name'] == last_used_profile_name: + logger.info(f"[Worker {worker_id}] Re-locked same profile '{locked_profile['name']}'. Unlocking and pausing to allow for rotation.") + profile_manager_instance.unlock_profile(locked_profile['name'], owner=owner_id) + + wait_seconds = direct_policy.get('avoid_reuse_max_wait_seconds', 5) + time.sleep(wait_seconds) + + # After waiting, try to lock again. + logger.info(f"[Worker {worker_id}] Attempting to lock a new profile after waiting.") + locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) + + if locked_profile and locked_profile['name'] == last_used_profile_name: + logger.warning(f"[Worker {worker_id}] Still locking the same profile '{locked_profile['name']}' after waiting. Proceeding to use it to avoid getting stuck.") + elif locked_profile: + logger.info(f"[Worker {worker_id}] Switched to a different profile after waiting: '{locked_profile['name']}'.") + # --- End new logic --- + + if not locked_profile: + polling_interval = exec_control.get('worker_polling_interval_seconds', 1) + # --- Add diagnostic logging --- + all_profiles_in_pool = profile_manager_instance.list_profiles() + profiles_in_prefix = [p for p in all_profiles_in_pool if p['name'].startswith(profile_prefix)] + if profiles_in_prefix: + state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) + states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) + logger.info(f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix}*): {states_summary}. Pausing for {polling_interval}s.") + else: + logger.info(f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix}'. Pausing for {polling_interval}s.") + # --- End diagnostic logging --- + time.sleep(polling_interval) + continue + + profile_name = locked_profile['name'] + proxy_url = locked_profile['proxy'] + + # 2. 
Get a batch of URLs from the shared list + url_batch, start_idx = state_manager.get_next_url_batch(batch_size, urls_list) + if not url_batch: + logger.info(f"[Worker {worker_id}] No more URLs to process. Worker exiting.") + break # Exit the while loop + + url_batch_len = len(url_batch) + batch_started = True + + # --- Calculate how many download tasks will be generated --- + downloads_per_url = 0 # Default to 0, meaning no increment unless configured + downloads_per_url_config = gen_policy.get('downloads_per_url') + if downloads_per_url_config: + if isinstance(downloads_per_url_config, int): + downloads_per_url = downloads_per_url_config + elif downloads_per_url_config == 'from_download_policy': + download_policy = policy.get('download_policy', {}) + formats_str = download_policy.get('formats', '') + if formats_str: + # Use smarter parsing to handle complex yt-dlp format selectors + if any(c in formats_str for c in '/+[]()'): + num_formats = 1 + else: + num_formats = len([f for f in formats_str.split(',') if f.strip()]) + + if num_formats > 0: + downloads_per_url = num_formats + + if downloads_per_url > 0: + downloads_to_increment = url_batch_len * downloads_per_url + profile_manager_instance.increment_pending_downloads(profile_name, downloads_to_increment) + logger.info(f"[Worker {worker_id}] [{profile_name}] Preemptively incremented pending downloads by {downloads_to_increment} for the upcoming batch ({url_batch_len} URLs * {downloads_per_url} formats).") + else: + logger.warning(f"[Worker {worker_id}] [{profile_name}] 'downloads_per_url' is not configured or resolves to 0. Pending downloads counter will not be incremented for this batch.") + + end_idx = start_idx + len(url_batch) + logger.info(f"[Worker {worker_id}] [{profile_name}] Processing batch of {len(url_batch)} URLs (lines {start_idx + 1}-{end_idx} from source).") + + video_ids_in_batch = {sp_utils.get_video_id(u) for u in url_batch} + + # 3. Write URLs to a temporary batch file + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8') as f: + temp_batch_file = f.name + f.write('\n'.join(url_batch)) + + # 4. Construct and run the command + ytdlp_cmd_str = direct_policy.get('ytdlp_command') + if not ytdlp_cmd_str: + logger.error(f"[Worker {worker_id}] Direct batch mode requires 'direct_batch_cli_policy.ytdlp_command'.") + break + + cmd = shlex.split(ytdlp_cmd_str) + cmd.extend(['--batch-file', temp_batch_file]) + cmd.extend(['--proxy', proxy_url]) + + # The output template should not include the .info.json extension, as + # yt-dlp adds it automatically when --write-info-json is used. 
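+ # With the default '%(id)s' template this yields files named '<video_id>.info.json'.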
+ output_template_str = direct_policy.get('ytdlp_output_template', '%(id)s') + ytdlp_args = direct_policy.get('ytdlp_args') + + custom_env = direct_policy.get('env_vars', {}).copy() + + # --- PYTHONPATH for custom yt-dlp module --- + ytdlp_module_path = direct_policy.get('ytdlp_module_path') + if ytdlp_module_path: + existing_pythonpath = custom_env.get('PYTHONPATH', os.environ.get('PYTHONPATH', '')) + # Prepend the custom path to PYTHONPATH to give it precedence + custom_env['PYTHONPATH'] = f"{ytdlp_module_path}{os.pathsep}{existing_pythonpath}".strip(os.pathsep) + logger.debug(f"[Worker {worker_id}] Using custom PYTHONPATH: {custom_env['PYTHONPATH']}") + + custom_env['YTDLP_PROFILE_NAME'] = profile_name + custom_env['YTDLP_PROXY_URL'] = proxy_url + env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') + custom_env['YTDLP_SIM_MODE'] = env_name + + # Create a per-profile cache directory and set XDG_CACHE_HOME + cache_dir_base = direct_policy.get('cache_dir_base', '.cache') + profile_cache_dir = os.path.join(cache_dir_base, profile_name) + try: + os.makedirs(profile_cache_dir, exist_ok=True) + custom_env['XDG_CACHE_HOME'] = profile_cache_dir + except OSError as e: + logger.error(f"[Worker {worker_id}] Failed to create cache directory '{profile_cache_dir}': {e}") + + # --- Manage User-Agent --- + # Use a consistent User-Agent per profile, storing it in the profile's cache directory. + user_agent = None + user_agent_file = os.path.join(profile_cache_dir, 'user_agent.txt') + try: + if os.path.exists(user_agent_file): + with open(user_agent_file, 'r', encoding='utf-8') as f: + user_agent = f.read().strip() + + if not user_agent: # File doesn't exist or is empty + user_agent = sp_utils.generate_user_agent_from_policy(policy) + with open(user_agent_file, 'w', encoding='utf-8') as f: + f.write(user_agent) + logger.info(f"[{profile_name}] Generated and saved new User-Agent: '{user_agent}'") + else: + logger.info(f"[{profile_name}] Using existing User-Agent from cache: '{user_agent}'") + except IOError as e: + logger.error(f"[Worker {worker_id}] Error accessing User-Agent file '{user_agent_file}': {e}. Using generated UA for this run.") + user_agent = sp_utils.generate_user_agent_from_policy(policy) # fallback + + # Add proxy rename from policy if specified, for custom yt-dlp forks + proxy_rename = direct_policy.get('ytdlp_proxy_rename') + if proxy_rename: + custom_env['YTDLP_PROXY_RENAME'] = proxy_rename + + if user_agent: + cmd.extend(['--user-agent', user_agent]) + + if ytdlp_args: + cmd.extend(shlex.split(ytdlp_args)) + + if args.verbose and '--verbose' not in cmd: + cmd.append('--verbose') + + if args.dummy_batch: + # In dummy batch mode, we simulate the entire batch process directly. 
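+ # No external yt-dlp process is started: per-URL outcomes are drawn from the
+ # dummy_simulation_settings rates and dummy info.json files are written straight
+ # into the save directory.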
+ log_cmd = list(cmd) + log_cmd.extend(['-o', os.path.join('temp_dir', output_template_str)]) + logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY BATCH MODE: Simulating batch of {len(url_batch)} URLs.") + logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY BATCH MODE: Would run real command: {' '.join(shlex.quote(s) for s in log_cmd)}") + logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY BATCH MODE: With environment: {custom_env}") + + dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {}) + auth_failure_rate = dummy_settings.get('auth_failure_rate', 0.0) + auth_skipped_rate = dummy_settings.get('auth_skipped_failure_rate', 0.0) + min_seconds = dummy_settings.get('auth_min_seconds', 0.1) + max_seconds = dummy_settings.get('auth_max_seconds', 0.5) + + for url in url_batch: + time.sleep(random.uniform(min_seconds, max_seconds)) + video_id = sp_utils.get_video_id(url) or f"dummy_{random.randint(1000, 9999)}" + + rand_val = random.random() + if rand_val < auth_skipped_rate: + logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating tolerated failure for {video_id}.") + profile_manager_instance.record_activity(profile_name, 'tolerated_error') + elif rand_val < (auth_skipped_rate + auth_failure_rate): + logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating fatal failure for {video_id}.") + profile_manager_instance.record_activity(profile_name, 'failure') + else: + # Success - create dummy info.json + files_created += 1 + profile_manager_instance.record_activity(profile_name, 'success') + info_data = {'id': video_id, 'title': f'Dummy Video {video_id}', '_dummy': True} + env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') + info_data['_ytops_metadata'] = { + 'profile_name': profile_name, 'proxy_url': proxy_url, + 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), + 'auth_env': env_name + } + + final_path = Path(save_dir) / f"{video_id}.info.json" + rename_template = direct_policy.get('rename_file_template') + if rename_template: + sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) + new_name = rename_template.format(video_id=video_id, profile_name=profile_name, proxy=sanitized_proxy) + final_path = Path(save_dir) / new_name + + try: + with open(final_path, 'w', encoding='utf-8') as f: + json.dump(info_data, f, indent=2) + logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY: Created dummy info.json: '{final_path}'") + except IOError as e: + logger.error(f"[Worker {worker_id}] [{profile_name}] DUMMY: Failed to write dummy info.json: {e}") + + success = (files_created > 0) + state_manager.record_batch_result(success, len(url_batch), profile_name=profile_name) + event = { 'type': 'fetch_batch', 'profile': profile_name, 'proxy_url': proxy_url, 'success': success, 'details': f"Dummy batch completed. 
Files created: {files_created}/{len(url_batch)}.", 'video_count': len(url_batch) } + state_manager.log_event(event) + + else: + with tempfile.TemporaryDirectory(prefix=f"ytdlp-batch-{worker_id}-") as temp_output_dir: + output_template = os.path.join(temp_output_dir, output_template_str) + cmd.extend(['-o', output_template]) + + logger.info(f"[Worker {worker_id}] [{profile_name}] Processing batch of {len(url_batch)} URLs...") + logger.info(f"[Worker {worker_id}] [{profile_name}] Running command: {' '.join(shlex.quote(s) for s in cmd)}") + logger.info(f"[Worker {worker_id}] [{profile_name}] With environment: {custom_env}") + retcode, stdout, stderr = run_command( + cmd, running_processes, process_lock, env=custom_env, stream_output=args.verbose, + stream_prefix=f"[Worker {worker_id} | yt-dlp] " + ) + + is_bot_error = "Sign in to confirm you're not a bot" in stderr + if is_bot_error: + logger.warning(f"[Worker {worker_id}] [{profile_name}] Bot detection occurred during batch. Marking as failure.") + + processed_files = list(Path(temp_output_dir).glob('*.json')) + + for temp_path in processed_files: + files_created += 1 + video_id = "unknown" + try: + with open(temp_path, 'r+', encoding='utf-8') as f: + info_data = json.load(f) + video_id = info_data.get('id', 'unknown') + env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') + info_data['_ytops_metadata'] = { + 'profile_name': profile_name, + 'proxy_url': proxy_url, + 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), + 'auth_env': env_name + } + f.seek(0) + json.dump(info_data, f, indent=2) + f.truncate() + + final_path = Path(save_dir) / temp_path.name + rename_template = direct_policy.get('rename_file_template') + if rename_template: + sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) + new_name = rename_template.format( + video_id=video_id, profile_name=profile_name, proxy=sanitized_proxy + ) + final_path = Path(save_dir) / new_name + + shutil.move(str(temp_path), str(final_path)) + logger.info(f"[Worker {worker_id}] Post-processed and moved info.json to '{final_path}'") + + except (IOError, json.JSONDecodeError, OSError) as e: + logger.error(f"[Worker {worker_id}] Error post-processing '{temp_path.name}' (video: {video_id}): {e}") + + # The orchestrator records per-URL success/failure for the profile. + # A batch is considered an overall success for logging if it had no fatal errors + # and produced at least one file. + success = (files_created > 0 and not is_bot_error) + + if not success: + reason = "bot detection occurred" if is_bot_error else f"0 files created out of {len(url_batch)}" + logger.warning(f"[Worker {worker_id}] [{profile_name}] Marking batch as FAILED. Reason: {reason}.") + + # Record batch stats for overall orchestrator health + state_manager.record_batch_result(success, len(url_batch), profile_name=profile_name) + + # In this mode, the custom yt-dlp script is responsible for recording + # per-URL activity ('success', 'failure', 'tolerated_error') directly into Redis. + # The orchestrator does not record activity here to avoid double-counting. + logger.info(f"[Worker {worker_id}] [{profile_name}] Batch finished. Per-URL activity was recorded by the yt-dlp script.") + + event_details = f"Batch completed. Exit: {retcode}. Files created: {files_created}/{len(url_batch)}." + if not success and stderr: + if is_bot_error: + event_details += " Stderr: Bot detection occurred." 
+ else: + event_details += f" Stderr: {stderr.strip().splitlines()[-1]}" + + event = { 'type': 'fetch_batch', 'profile': profile_name, 'proxy_url': proxy_url, 'success': success, 'details': event_details, 'video_count': len(url_batch) } + state_manager.log_event(event) + + except Exception as e: + logger.error(f"[Worker {worker_id}] Unexpected error in worker loop: {e}", exc_info=True) + if locked_profile: + profile_manager_instance.record_activity(locked_profile['name'], 'failure') + finally: + if locked_profile and batch_started: + # --- Reconcile pending downloads counter --- + if downloads_per_url > 0: + initial_increment = url_batch_len * downloads_per_url + actual_downloads = files_created * downloads_per_url + adjustment = actual_downloads - initial_increment + if adjustment != 0: + logger.warning(f"[Worker {worker_id}] [{profile_name}] Reconciling pending downloads. Batch created {files_created}/{url_batch_len} successful info.json(s). Adjusting counter by {adjustment}.") + profile_manager_instance.increment_pending_downloads(locked_profile['name'], adjustment) + + if locked_profile: + last_used_profile_name = locked_profile['name'] + cooldown = None + # DESIGN: The cooldown duration is not configured in the worker's policy. + # Instead, it is read from a central Redis key. This key is set by the + # policy-enforcer, making the enforcer the single source of truth for + # this policy. This allows changing the cooldown behavior without + # restarting the workers. + cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds') + if cooldown_config: + try: + val = json.loads(cooldown_config) + if isinstance(val, list) and len(val) == 2 and val[0] < val[1]: + cooldown = random.randint(val[0], val[1]) + elif isinstance(val, int): + cooldown = val + except (json.JSONDecodeError, TypeError): + if cooldown_config.isdigit(): + cooldown = int(cooldown_config) + + if cooldown: + logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") + + profile_manager_instance.unlock_profile( + locked_profile['name'], + owner=owner_id, + rest_for_seconds=cooldown + ) + if temp_batch_file and os.path.exists(temp_batch_file): + os.unlink(temp_batch_file) + + logger.info(f"[Worker {worker_id}] Worker loop finished.") + return [] \ No newline at end of file diff --git a/ytops_client-source/ytops_client/stress_policy/direct_docker_worker.py b/ytops_client-source/ytops_client/stress_policy/direct_docker_worker.py new file mode 100644 index 0000000..908e64b --- /dev/null +++ b/ytops_client-source/ytops_client/stress_policy/direct_docker_worker.py @@ -0,0 +1,1363 @@ +import collections +import json +import logging +import os +import random +import re +import shlex +import sys +import tempfile +import shutil +import subprocess +import threading +import time +from copy import deepcopy +from datetime import datetime, timezone +from pathlib import Path + +from . 
import utils as sp_utils +from .process_runners import run_command, run_docker_container, get_worker_id +from ..profile_manager_tool import ProfileManager +from .worker_utils import find_task_and_lock_profile, get_auth_manager, _cleanup_media_file, _run_ffprobe + + +logger = logging.getLogger(__name__) + + +def _post_process_and_move_info_json(temp_path, profile_name, proxy_url, policy, worker_id, state_manager, profile_manager_instance): + """Reads a temporary info.json, injects metadata, and moves it to the final directory.""" + direct_policy = policy.get('direct_docker_cli_policy', {}) + save_dir = policy.get('settings', {}).get('save_info_json_dir') + + try: + with open(temp_path, 'r+', encoding='utf-8') as f: + info_data = json.load(f) + + env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') + info_data['_ytops_metadata'] = { + 'profile_name': profile_name, + 'proxy_url': proxy_url, + 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), + 'auth_env': env_name + } + + f.seek(0) + json.dump(info_data, f, indent=2) + f.truncate() + + video_id = info_data.get('id') or temp_path.stem + final_path = Path(save_dir) / f"{video_id}.info.json" + + rename_template = direct_policy.get('rename_file_template') + if rename_template: + sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) + new_name = rename_template.format(video_id=video_id, profile_name=profile_name, proxy=sanitized_proxy) + final_path = Path(save_dir) / new_name + + shutil.move(str(temp_path), str(final_path)) + logger.info(f"[{profile_name}] Post-processed and moved info.json to '{final_path}'") + return True + except (IOError, json.JSONDecodeError, OSError) as e: + logger.error(f"[{profile_name}] Failed to post-process '{temp_path.name}': {e}") + return False + + +def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock, profile_prefix=None): + """A worker for the 'direct_docker_cli' orchestration mode (fetch_only).""" + owner_id = f"direct-docker-worker-{worker_id}" + settings = policy.get('settings', {}) + exec_control = policy.get('execution_control', {}) + gen_policy = policy.get('info_json_generation_policy', {}) + direct_policy = policy.get('direct_docker_cli_policy', {}) + queue_policy = policy.get('queue_policy') + + # Prioritize the passed-in profile_prefix for worker pool compatibility. + # If it's not passed (e.g. legacy 'workers' mode), fall back to policy. + if not profile_prefix: + profile_prefix = gen_policy.get('profile_prefix') + + # Unlike other modes, this worker can function without a prefix (it will try to lock any active profile). + # The check `if not profile_prefix` is removed to allow this flexibility. + + batch_size = direct_policy.get('batch_size') + if not batch_size and queue_policy: + batch_size = queue_policy.get('batch_size') + + if not batch_size: + logger.error(f"[Worker {worker_id}] Direct docker mode requires 'batch_size' in 'direct_docker_cli_policy' or 'queue_policy'. Worker exiting.") + return [] + + save_dir = settings.get('save_info_json_dir') + if not save_dir: + logger.error(f"[Worker {worker_id}] Direct docker mode requires 'settings.save_info_json_dir'. 
Worker exiting.") + return [] + os.makedirs(save_dir, exist_ok=True) + + # --- Docker specific config --- + image_name = direct_policy.get('docker_image_name') + host_mount_path = os.path.abspath(direct_policy.get('docker_host_mount_path')) + container_mount_path = direct_policy.get('docker_container_mount_path') + host_cache_path = direct_policy.get('docker_host_cache_path') + if host_cache_path: host_cache_path = os.path.abspath(host_cache_path) + container_cache_path = direct_policy.get('docker_container_cache_path') + network_name = direct_policy.get('docker_network_name') + + if not all([image_name, host_mount_path, container_mount_path]): + logger.error(f"[Worker {worker_id}] Direct docker mode requires 'docker_image_name', 'docker_host_mount_path', and 'docker_container_mount_path'. Worker exiting.") + return [] + + try: + os.makedirs(host_mount_path, exist_ok=True) + except OSError as e: + logger.error(f"[Worker {worker_id}] Could not create docker_host_mount_path '{host_mount_path}': {e}. Worker exiting.") + return [] + + last_used_profile_name = None + while not state_manager.shutdown_event.is_set(): + locked_profile = None + temp_task_dir_host = None + task_batch = [] + # --- Variables for robust finalization --- + url_batch_len = 0 + batch_started = False + downloads_per_url = 0 # Default to 0, meaning no increment unless configured + # --- + try: + # 1. Lock a profile + locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) + + # --- New logic to avoid immediate reuse --- + avoid_reuse = direct_policy.get('avoid_immediate_profile_reuse', False) + if avoid_reuse and locked_profile and last_used_profile_name and locked_profile['name'] == last_used_profile_name: + logger.info(f"[Worker {worker_id}] Re-locked same profile '{locked_profile['name']}'. Unlocking and pausing to allow for rotation.") + profile_manager_instance.unlock_profile(locked_profile['name'], owner=owner_id) + + wait_seconds = direct_policy.get('avoid_reuse_max_wait_seconds', 5) + time.sleep(wait_seconds) + + # After waiting, try to lock again. + logger.info(f"[Worker {worker_id}] Attempting to lock a new profile after waiting.") + locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) + + if locked_profile and locked_profile['name'] == last_used_profile_name: + logger.warning(f"[Worker {worker_id}] Still locking the same profile '{locked_profile['name']}' after waiting. Proceeding to use it to avoid getting stuck.") + elif locked_profile: + logger.info(f"[Worker {worker_id}] Switched to a different profile after waiting: '{locked_profile['name']}'.") + # --- End new logic --- + + if not locked_profile: + polling_interval = exec_control.get('worker_polling_interval_seconds', 1) + # --- Add diagnostic logging --- + all_profiles_in_pool = profile_manager_instance.list_profiles() + profiles_in_prefix = [p for p in all_profiles_in_pool if p['name'].startswith(profile_prefix or '')] + if profiles_in_prefix: + state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) + states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) + logger.info(f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s.") + else: + logger.info(f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix or '*'}'. 
Pausing for {polling_interval}s.") + # --- End diagnostic logging --- + time.sleep(polling_interval) + continue + + profile_name = locked_profile['name'] + proxy_url = locked_profile['proxy'] + + # --- Manage User-Agent, Visitor ID, and Cache Directory --- + user_agent = None + visitor_id = None + environment = {} + profile_cache_dir_host = None + if host_cache_path: + profile_cache_dir_host = os.path.join(host_cache_path, profile_name) + try: + os.makedirs(profile_cache_dir_host, exist_ok=True) + if container_cache_path: + environment['XDG_CACHE_HOME'] = container_cache_path + + # --- User-Agent --- + user_agent_file = os.path.join(profile_cache_dir_host, 'user_agent.txt') + if os.path.exists(user_agent_file): + with open(user_agent_file, 'r', encoding='utf-8') as f: + user_agent = f.read().strip() + + if not user_agent: + user_agent = sp_utils.generate_user_agent_from_policy(policy) + with open(user_agent_file, 'w', encoding='utf-8') as f: + f.write(user_agent) + logger.info(f"[{profile_name}] Generated and saved new User-Agent: '{user_agent}'") + else: + logger.info(f"[{profile_name}] Using existing User-Agent from cache: '{user_agent}'") + + # --- Visitor ID --- + if direct_policy.get('track_visitor_id'): + visitor_id_file = os.path.join(profile_cache_dir_host, 'visitor_id.txt') + if os.path.exists(visitor_id_file): + with open(visitor_id_file, 'r', encoding='utf-8') as f: + visitor_id = f.read().strip() + if visitor_id: + logger.info(f"[{profile_name}] Using existing Visitor ID from cache: '{visitor_id}'") + + except IOError as e: + logger.error(f"[Worker {worker_id}] Error accessing cache file in '{profile_cache_dir_host}': {e}. Using generated UA for this run.") + user_agent = sp_utils.generate_user_agent_from_policy(policy) # Fallback for UA + else: + # Fallback if no cache is configured for auth simulation + user_agent = sp_utils.generate_user_agent_from_policy(policy) + + # 2. Get a batch of URLs + url_batch = [] + start_idx = 0 + + if not queue_policy: + url_batch, start_idx = state_manager.get_next_url_batch(batch_size, urls_list) + else: + task_batch = state_manager.get_auth_tasks_batch(batch_size) + if task_batch: + # Mark tasks as in-progress + for task in task_batch: + task_id = task.get('id') or task.get('task_id') + if task_id: + state_manager.mark_auth_in_progress(task_id, owner_id) + url_batch = [task.get('url') for task in task_batch if task.get('url')] + + if not url_batch: + if not queue_policy: + logger.info(f"[Worker {worker_id}] No more URLs to process. Worker exiting.") + break # Exit the while loop + else: + # Queue mode: no tasks, unlock and poll + polling_interval = exec_control.get('worker_polling_interval_seconds', 1) + logger.debug(f"[Worker {worker_id}] No tasks in queue for profile '{profile_name}'. Unlocking and sleeping for {polling_interval}s.") + profile_manager_instance.unlock_profile(profile_name, owner=owner_id) + locked_profile = None + time.sleep(polling_interval) + continue + + url_batch_len = len(url_batch) + batch_started = True + + # --- Calculate how many download tasks will be generated --- + # The "pending downloads" counter for an auth profile tracks the number of + # info.json files it has generated that are waiting to be processed. + # Each successful URL fetch creates one info.json file (one task). + # The number of actual media files downloaded from that info.json is + # irrelevant to the auth profile's counter. 
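+ # In practice the configured 'downloads_per_url' value only acts as an on/off
+ # switch here; when set, the counter is incremented by exactly one per URL in
+ # the batch.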
+ downloads_per_url = 0 # Default to 0, meaning no increment unless configured + downloads_per_url_config = gen_policy.get('downloads_per_url') + if downloads_per_url_config: + downloads_per_url = 1 # We just need a flag to enable the logic, the count is per-URL. + + if downloads_per_url > 0: + # Increment by the number of URLs in the batch. + downloads_to_increment = url_batch_len + profile_manager_instance.increment_pending_downloads(profile_name, downloads_to_increment) + logger.info(f"[Worker {worker_id}] [{profile_name}] Preemptively incremented pending downloads by {downloads_to_increment} for the upcoming batch of {url_batch_len} URLs.") + else: + logger.warning(f"[Worker {worker_id}] [{profile_name}] 'downloads_per_url' is not configured. Pending downloads counter will not be incremented for this batch.") + + end_idx = start_idx + len(url_batch) + logger.info(f"[Worker {worker_id}] [{profile_name}] Processing batch of {len(url_batch)} URLs (lines {start_idx + 1}-{end_idx} from source).") + + # 3. Prepare files on the host + temp_task_dir_host = tempfile.mkdtemp(prefix=f"docker-task-{worker_id}-", dir=host_mount_path) + task_dir_name = os.path.basename(temp_task_dir_host) + task_dir_container = os.path.join(container_mount_path, task_dir_name) + + # The config file path is passed explicitly to yt-dlp via --config-locations, + # so setting XDG_CONFIG_HOME in the environment is redundant. + + # Write batch file + temp_batch_file_host = os.path.join(temp_task_dir_host, 'batch.txt') + with open(temp_batch_file_host, 'w', encoding='utf-8') as f: + f.write('\n'.join(url_batch)) + + # Write yt-dlp config file + base_config_content = "" + base_config_file = direct_policy.get('ytdlp_config_file') + if base_config_file: + # Try path as-is first, then relative to project root. + config_path_to_read = Path(base_config_file) + if not config_path_to_read.exists(): + config_path_to_read = Path(sp_utils._PROJECT_ROOT) / base_config_file + + if config_path_to_read.exists(): + try: + with open(config_path_to_read, 'r', encoding='utf-8') as f: + base_config_content = f.read() + logger.info(f"[Worker {worker_id}] [{profile_name}] Loaded base config from '{config_path_to_read}'") + except IOError as e: + logger.error(f"[Worker {worker_id}] Could not read ytdlp_config_file '{config_path_to_read}': {e}") + else: + logger.error(f"[Worker {worker_id}] Could not find ytdlp_config_file: '{base_config_file}'") + + config_overrides = direct_policy.get('ytdlp_config_overrides', {}).copy() + + if direct_policy.get('use_cookies') and host_cache_path and container_cache_path: + try: + cookie_file_host = os.path.join(profile_cache_dir_host, 'cookies.txt') + # Ensure the file exists and has the Netscape header if it's empty. 
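+ # (Parsers that expect the Netscape cookie format will typically reject a file
+ # that is missing this magic header line.)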
+ if not os.path.exists(cookie_file_host) or os.path.getsize(cookie_file_host) == 0: + with open(cookie_file_host, 'w', encoding='utf-8') as f: + f.write("# Netscape HTTP Cookie File\n") + logger.info(f"[{profile_name}] Created/initialized cookie file with header: {cookie_file_host}") + + cookie_file_container = os.path.join(container_cache_path, 'cookies.txt') + config_overrides['cookies'] = cookie_file_container + logger.info(f"[{profile_name}] Using persistent cookie jar: {cookie_file_host}") + except (IOError, OSError) as e: + logger.error(f"[Worker {worker_id}] Could not create cookie file in '{profile_cache_dir_host}': {e}") + + # Inject per-task values into overrides + config_overrides['proxy'] = proxy_url + config_overrides['batch-file'] = os.path.join(task_dir_container, 'batch.txt') + # The output template should not include the .info.json extension, as + # yt-dlp adds it automatically when --write-info-json is used. + config_overrides['output'] = os.path.join(task_dir_container, '%(id)s') + if user_agent: + config_overrides['user-agent'] = user_agent + + overrides_content = sp_utils._config_dict_to_flags_file_content(config_overrides) + raw_args_from_policy = direct_policy.get('ytdlp_raw_args', []) + + # --- Inject visitor_id into raw args if available --- + if visitor_id: + # Start with a copy of the raw args from policy + new_raw_args = list(raw_args_from_policy) + + # --- Handle youtube extractor args --- + youtube_arg_index = -1 + original_youtube_value = None + for i, arg in enumerate(new_raw_args): + if arg.startswith('--extractor-args') and 'youtube:' in arg: + youtube_arg_index = i + try: + parts = shlex.split(arg) + if len(parts) == 2 and parts[1].startswith('youtube:'): + original_youtube_value = parts[1] + except ValueError: + logger.warning(f"Could not parse extractor-arg, will not modify: {arg}") + break # Found it, stop searching + + if youtube_arg_index != -1 and original_youtube_value: + # Modify existing youtube arg + new_value = f'{original_youtube_value.rstrip()};visitor_data={visitor_id}' + if 'skip=' in new_value: + new_value = re.sub(r'skip=([^;\'"]*)', r'skip=\1,webpage,configs', new_value) + else: + new_value += ';skip=webpage,configs' + new_raw_args[youtube_arg_index] = f'--extractor-args "{new_value}"' + else: + # Add new youtube arg + logger.warning(f"[{profile_name}] No existing '--extractor-args youtube:...' found. 
Adding a new one for visitor_id.") + new_raw_args.append(f'--extractor-args "youtube:visitor_data={visitor_id};skip=webpage,configs"') + + # --- Handle youtubetab extractor args --- + youtubetab_arg_index = -1 + for i, arg in enumerate(new_raw_args): + if arg.startswith('--extractor-args') and 'youtubetab:' in arg: + youtubetab_arg_index = i + break + + # The request is to set/replace this argument + new_youtubetab_arg = '--extractor-args "youtubetab:skip=webpage"' + if youtubetab_arg_index != -1: + # Replace existing + new_raw_args[youtubetab_arg_index] = new_youtubetab_arg + else: + # Add new + new_raw_args.append(new_youtubetab_arg) + + raw_args_from_policy = new_raw_args + # --- End visitor_id injection --- + + raw_args_content = '\n'.join(raw_args_from_policy) + + config_content = f"{base_config_content.strip()}\n\n# --- Overrides from policy ---\n{overrides_content}" + if raw_args_content: + config_content += f"\n\n# --- Raw args from policy ---\n{raw_args_content}" + + logger.info(f"[Worker {worker_id}] [{profile_name}] Generated yt-dlp config file content:\n---config---\n{config_content}\n------------") + + # Create the directory structure yt-dlp expects inside the temp task dir + ytdlp_config_dir_host = os.path.join(temp_task_dir_host, 'yt-dlp') + os.makedirs(ytdlp_config_dir_host, exist_ok=True) + temp_config_file_host = os.path.join(ytdlp_config_dir_host, 'config') + with open(temp_config_file_host, 'w', encoding='utf-8') as f: + f.write(config_content) + + # 4. Construct and run the 'docker run' command + volumes = { + host_mount_path: { + 'bind': container_mount_path, + 'mode': 'rw' + } + } + if host_cache_path and container_cache_path: + profile_cache_dir_host = os.path.join(host_cache_path, profile_name) + os.makedirs(profile_cache_dir_host, exist_ok=True) + volumes[profile_cache_dir_host] = { + 'bind': container_cache_path, + 'mode': 'rw' + } + + # The command tells yt-dlp exactly where to find the config file we created. 
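+ # Only the per-task values injected above (proxy, batch-file, output template,
+ # user-agent and, optionally, cookies/extractor-args) vary between runs; the rest
+ # comes from the base config file and the policy overrides.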
+ command = ['yt-dlp', '--config-locations', os.path.join(task_dir_container, 'yt-dlp/config')] + logger.info(f"[Worker {worker_id}] [{profile_name}] Running docker command: {' '.join(shlex.quote(s) for s in command)}") + + # For logging purposes, construct the full equivalent command line with host paths + log_config_overrides_for_host = config_overrides.copy() + log_config_overrides_for_host['batch-file'] = temp_batch_file_host + log_config_overrides_for_host['output'] = os.path.join(temp_task_dir_host, '%(id)s') + if 'cookies' in log_config_overrides_for_host and host_cache_path: + log_config_overrides_for_host['cookies'] = os.path.join(profile_cache_dir_host, 'cookies.txt') + + log_command_override = ['yt-dlp'] + if base_config_content: + log_command_override.extend(sp_utils._parse_config_file_to_cli_args(base_config_content)) + log_command_override.extend(sp_utils._config_dict_to_cli_flags(log_config_overrides_for_host)) + for raw_arg in raw_args_from_policy: + log_command_override.extend(shlex.split(raw_arg)) + + # --- Live log parsing and activity recording --- + live_success_count = 0 + live_failure_count = 0 + live_tolerated_count = 0 + activity_lock = threading.Lock() + + # Get error patterns from policy for live parsing + tolerated_error_patterns = direct_policy.get('tolerated_error_patterns', []) + fatal_error_patterns = direct_policy.get('fatal_error_patterns', []) + + def log_parser_callback(line): + nonlocal live_success_count, live_failure_count, live_tolerated_count + + # --- Visitor ID Extraction --- + if direct_policy.get('track_visitor_id') and profile_cache_dir_host: + # e.g., [debug] [youtube] [pot:cache] TRACE: Retrieved cache spec PoTokenCacheSpec(key_bindings={'t': 'webpo', 'cb': '...', 'cbt': 'visitor_id', ... + match = re.search(r"'cb': '([^']*)', 'cbt': 'visitor_id'", line) + if match: + new_visitor_id = match.group(1) + logger.info(f"[Worker {worker_id}] [{profile_name}] Detected new Visitor ID: {new_visitor_id}") + try: + visitor_id_file = os.path.join(profile_cache_dir_host, 'visitor_id.txt') + with open(visitor_id_file, 'w', encoding='utf-8') as f: + f.write(new_visitor_id) + logger.info(f"[{profile_name}] Saved new Visitor ID to cache.") + except IOError as e: + logger.error(f"[{profile_name}] Failed to save new Visitor ID to cache: {e}") + + # Success is the highest priority check + if '[info] Writing video metadata as JSON to:' in line: + post_processed_successfully = False + # --- Immediate post-processing --- + try: + path_match = re.search(r"Writing video metadata as JSON to: '?([^']+)'?$", line) + if not path_match: + path_match = re.search(r"Writing video metadata as JSON to: (.*)$", line) + + if path_match: + container_file_path = path_match.group(1).strip() + + if container_file_path.startswith(container_mount_path): + relative_path = os.path.relpath(container_file_path, container_mount_path) + host_file_path = os.path.join(host_mount_path, relative_path) + + # The file might not exist immediately. 
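The visitor-id detection above keys off a specific PO-token cache debug line. A quick standalone check of that regex, using a made-up cache-spec line and visitor value:

```python
import re

line = ("[debug] [youtube] [pot:cache] TRACE: Retrieved cache spec "
        "PoTokenCacheSpec(key_bindings={'t': 'webpo', 'cb': 'CgtEXAMPLEVISITOR', 'cbt': 'visitor_id'})")
match = re.search(r"'cb': '([^']*)', 'cbt': 'visitor_id'", line)
print(match.group(1) if match else None)  # CgtEXAMPLEVISITOR
```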
+ for _ in range(5): # Retry for up to 0.5s + if os.path.exists(host_file_path): + break + time.sleep(0.1) + + if os.path.exists(host_file_path): + post_processed_successfully = _post_process_and_move_info_json( + Path(host_file_path), profile_name, proxy_url, policy, worker_id, + state_manager, profile_manager_instance=profile_manager_instance + ) + else: + logger.warning(f"File from log not found on host for immediate processing: {host_file_path}") + except Exception as e: + logger.error(f"Error during immediate post-processing from log line: {e}") + + with activity_lock: + if post_processed_successfully: + live_success_count += 1 + logger.info(f"[Worker {worker_id}] [{profile_name}] Live success #{live_success_count} detected and post-processed.") + profile_manager_instance.record_activity(profile_name, 'success') + else: + live_failure_count += 1 + logger.error(f"[Worker {worker_id}] [{profile_name}] Post-processing failed for a successful fetch. Recording as failure.") + profile_manager_instance.record_activity(profile_name, 'failure') + # --- End immediate post-processing --- + return False + + # Check for fatal patterns (e.g., bot detection) which might not start with ERROR: + for pattern in fatal_error_patterns: + if re.search(pattern, line, re.IGNORECASE): + with activity_lock: + live_failure_count += 1 + logger.error(f"[Worker {worker_id}] [{profile_name}] Live FATAL error #{live_failure_count} detected from log: {line}") + profile_manager_instance.record_activity(profile_name, 'failure') + if direct_policy.get('ban_on_fatal_error_in_batch'): + logger.warning(f"Banning profile '{profile_name}' immediately due to fatal error to stop container.") + profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during batch') + return True # Signal to stop container + return False # Do not stop if ban_on_fatal_error_in_batch is false + + # Only process lines that contain ERROR: for tolerated/generic failures + if 'ERROR:' not in line: + return False + + # Check if it's a tolerated error + for pattern in tolerated_error_patterns: + if re.search(pattern, line, re.IGNORECASE): + with activity_lock: + live_tolerated_count += 1 + logger.warning(f"[Worker {worker_id}] [{profile_name}] Live TOLERATED error #{live_tolerated_count} detected from log: {line}") + profile_manager_instance.record_activity(profile_name, 'tolerated_error') + return False + + # If it's an ERROR: line and not tolerated, it's a failure + with activity_lock: + live_failure_count += 1 + logger.warning(f"[Worker {worker_id}] [{profile_name}] Live failure #{live_failure_count} detected from log: {line}") + profile_manager_instance.record_activity(profile_name, 'failure') + + return False + + if args.dummy_batch: + logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY BATCH MODE: Simulating Docker batch of {len(url_batch)} URLs.") + logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY BATCH MODE: Would run docker command: {' '.join(shlex.quote(s) for s in command)}") + logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY BATCH MODE: With environment: {environment}") + + dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {}) + auth_failure_rate = dummy_settings.get('auth_failure_rate', 0.0) + auth_skipped_rate = dummy_settings.get('auth_skipped_failure_rate', 0.0) + min_seconds = dummy_settings.get('auth_min_seconds', 0.1) + max_seconds = dummy_settings.get('auth_max_seconds', 0.5) + + files_created = 0 + for url in url_batch: + 
time.sleep(random.uniform(min_seconds, max_seconds)) + video_id = sp_utils.get_video_id(url) or f"dummy_{random.randint(1000, 9999)}" + + rand_val = random.random() + if rand_val < auth_skipped_rate: + logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating tolerated failure for {video_id}.") + profile_manager_instance.record_activity(profile_name, 'tolerated_error') + elif rand_val < (auth_skipped_rate + auth_failure_rate): + logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating fatal failure for {video_id}.") + profile_manager_instance.record_activity(profile_name, 'failure') + else: + # Success + files_created += 1 + profile_manager_instance.record_activity(profile_name, 'success') + + # Create a dummy file in the temp output dir to simulate success + dummy_output_path = Path(temp_task_dir_host) / f"{video_id}.info.json" + try: + # Write minimal info to the dummy file so post-processing can succeed + with open(dummy_output_path, 'w', encoding='utf-8') as f: + json.dump({'id': video_id, '_dummy': True}, f) + except OSError as e: + logger.error(f"[Worker {worker_id}] [{profile_name}] DUMMY: Failed to create dummy output file: {e}") + + live_success_count = files_created + + success = (files_created > 0) + state_manager.record_batch_result(success, len(url_batch), profile_name=profile_name) + + logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY Batch processing complete. Worker will now unlock profile and attempt next batch.") + + processed_files = list(Path(temp_task_dir_host).glob('*.json')) + + for temp_path in processed_files: + _post_process_and_move_info_json( + temp_path, profile_name, proxy_url, policy, worker_id, state_manager, profile_manager_instance + ) + continue + + exit_code, stdout, stderr, stop_reason = run_docker_container( + image_name=image_name, + command=command, + volumes=volumes, + stream_prefix=f"[Worker {worker_id} | Docker] ", + network_name=network_name, + log_callback=log_parser_callback, + profile_manager=profile_manager_instance, + profile_name=profile_name, + environment=environment, + log_command_override=log_command_override + ) + + full_output = f"{stdout}\n{stderr}" + if "Sign in to confirm you're not a bot" in full_output: + logger.warning(f"[Worker {worker_id}] [{profile_name}] Bot detection occurred during batch. Marking as failure.") + + # Fallback post-processing for any files missed by the live parser. + # The live parser moves files, so this loop should only find leftovers. + # Post-process all generated info.json files from the temporary output directory + processed_files = list(Path(temp_task_dir_host).glob('*.json')) + post_process_success_count = 0 + for temp_path in processed_files: + if _post_process_and_move_info_json( + temp_path, profile_name, proxy_url, policy, worker_id, state_manager, profile_manager_instance + ): + post_process_success_count += 1 + + # Reconcile counts. If live log parsing failed, use the number of successfully + # moved files as the success count. This is a fallback and less accurate for failures. + if live_success_count == 0 and live_failure_count == 0 and live_tolerated_count == 0: + logger.warning(f"[Worker {worker_id}] [{profile_name}] Live log parsing did not capture any events. Using post-processed file count ({post_process_success_count}) as the success count.") + live_success_count = post_process_success_count + + + # A batch is considered an overall success for logging if it had no fatal errors + # and produced at least one file. 
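The dummy auth path above (and the dummy download paths later in this patch) decide each simulated outcome from a single random draw compared against the configured rates. A minimal sketch of that sampling; the function name and the 10%/20% rates are arbitrary examples, not the policy defaults.

```python
import random

def sample_outcome(skipped_rate, failure_rate):
    """One draw decides the outcome: [0, skipped) -> tolerated error,
    [skipped, skipped+failure) -> failure, everything else -> success."""
    r = random.random()
    if r < skipped_rate:
        return 'tolerated_error'
    if r < skipped_rate + failure_rate:
        return 'failure'
    return 'success'

# With skipped_rate=0.1 and failure_rate=0.2, roughly 10% of simulated URLs
# are tolerated errors, 20% failures, and 70% successes.
counts = {}
for _ in range(10000):
    outcome = sample_outcome(0.1, 0.2)
    counts[outcome] = counts.get(outcome, 0) + 1
print(counts)
```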
+ success = (exit_code == 0 or not stop_reason) and post_process_success_count > 0 + + if not success: + reason = f"exit code {exit_code}" if not stop_reason else stop_reason + logger.warning(f"[Worker {worker_id}] [{profile_name}] Marking batch as FAILED. Reason: {reason}.") + + state_manager.record_batch_result(success, len(url_batch), profile_name=profile_name) + + event = { + 'type': 'fetch_batch', + 'profile': profile_name, + 'proxy_url': proxy_url, + 'success': success, + 'details': f"Batch completed. Exit: {exit_code}. Files created: {post_process_success_count}/{len(url_batch)}. Live counts: {live_success_count} OK, {live_failure_count} FAIL, {live_tolerated_count} SKIP.", + 'video_count': len(url_batch) + } + if stop_reason: + event['details'] += f" Stop Reason: {stop_reason}." + + state_manager.log_event(event) + + logger.info(f"[Worker {worker_id}] [{profile_name}] Batch processing complete. Worker will now unlock profile and attempt next batch.") + + except Exception as e: + logger.error(f"[Worker {worker_id}] Unexpected error in worker loop: {e}", exc_info=True) + if locked_profile: + profile_manager_instance.record_activity(locked_profile['name'], 'failure') + finally: + if locked_profile and batch_started: + # --- Reconcile pending downloads counter --- + if downloads_per_url > 0: + # We incremented by url_batch_len at the start. + # Now we adjust for any URLs that failed to produce an info.json. + # The adjustment is the number of successes minus the number of attempts. + initial_increment = url_batch_len + actual_successes = live_success_count + adjustment = actual_successes - initial_increment + + if adjustment != 0: + # The adjustment will be negative, effectively decrementing the counter for each failure. + logger.warning(f"[Worker {worker_id}] [{profile_name}] Reconciling pending downloads. Batch created {live_success_count}/{url_batch_len} successful info.json(s). Adjusting counter by {adjustment}.") + profile_manager_instance.increment_pending_downloads(locked_profile['name'], adjustment) + + if locked_profile: + last_used_profile_name = locked_profile['name'] + cooldown = None + # DESIGN: The cooldown duration is not configured in the worker's policy. + # Instead, it is read from a central Redis key. This key is set by the + # policy-enforcer, making the enforcer the single source of truth for + # this policy. This allows changing the cooldown behavior without + # restarting the workers. + cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds') + if cooldown_config: + try: + val = json.loads(cooldown_config) + if isinstance(val, list) and len(val) == 2 and val[0] < val[1]: + cooldown = random.randint(val[0], val[1]) + elif isinstance(val, int): + cooldown = val + except (json.JSONDecodeError, TypeError): + if isinstance(cooldown_config, str) and cooldown_config.isdigit(): + cooldown = int(cooldown_config) + + if cooldown: + logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") + + unlocked_successfully = profile_manager_instance.unlock_profile( + locked_profile['name'], + owner=owner_id, + rest_for_seconds=cooldown + ) + if not unlocked_successfully: + logger.error(f"[Worker {worker_id}] FAILED to unlock profile '{locked_profile['name']}'. The profile may be stuck.") + + if temp_task_dir_host and os.path.exists(temp_task_dir_host): + # If shutdown is requested, a batch might have been interrupted after files were + # created but before they were post-processed. 
We preserve the temp directory + # to allow for manual recovery of the info.json files. + if state_manager.shutdown_event.is_set() and any(Path(temp_task_dir_host).iterdir()): + logger.warning(f"Shutdown requested. Preserving temporary task directory for manual recovery: {temp_task_dir_host}") + else: + try: + # Use a thread to timeout rmtree in case of hanging + def rmtree_thread(): + shutil.rmtree(temp_task_dir_host) + t = threading.Thread(target=rmtree_thread) + t.daemon = True + t.start() + t.join(timeout=10.0) + if t.is_alive(): + logger.warning(f"[Worker {worker_id}] Timeout removing temp task directory {temp_task_dir_host}. Leaving it.") + # The thread will be killed when daemon exits + except OSError as e: + logger.warning(f"Error removing temp task directory {temp_task_dir_host}: {e}") + + logger.info(f"[Worker {worker_id}] Worker loop finished.") + return [] + + +def run_direct_docker_download_worker(worker_id, policy, state_manager, args, profile_manager_instance, running_processes, process_lock, profile_prefix=None): + """A worker for the 'direct_docker_cli' orchestration mode with `mode: download_only`.""" + logger.info(f"[Worker {worker_id}] Download worker thread started for pool '{profile_prefix or '*'}'.") + try: + owner_id = f"direct-docker-dl-worker-{worker_id}" + settings = policy.get('settings', {}) + exec_control = policy.get('execution_control', {}) + d_policy = policy.get('download_policy', {}) + direct_policy = policy.get('direct_docker_cli_policy', {}) + queue_policy = policy.get('queue_policy') + + # Prioritize the passed-in profile_prefix for worker pool compatibility. + if not profile_prefix: + profile_prefix = d_policy.get('profile_prefix') + + # Unlike other modes, this worker can function without a prefix (it will try to lock any active profile). + # The check `if not profile_prefix` is removed to allow this flexibility. + + # --- Docker specific config --- + image_name = direct_policy.get('docker_image_name') + host_mount_path = direct_policy.get('docker_host_mount_path') + container_mount_path = direct_policy.get('docker_container_mount_path') + host_download_path = direct_policy.get('docker_host_download_path') + container_download_path = direct_policy.get('docker_container_download_path') + network_name = direct_policy.get('docker_network_name') + + if not all([image_name, host_mount_path, container_mount_path, host_download_path, container_download_path]): + logger.error(f"[Worker {worker_id}] Direct docker download mode requires all docker_* keys in 'direct_docker_cli_policy'. Worker exiting.") + return [] + + try: + os.makedirs(host_mount_path, exist_ok=True) + os.makedirs(host_download_path, exist_ok=True) + except OSError as e: + logger.error(f"[Worker {worker_id}] Could not create required host directories: {e}. 
Worker exiting.") + return [] + + no_task_streak = 0 + last_used_profile_name = None + task_counter = 0 + while not state_manager.shutdown_event.is_set(): + locked_profile = None + claimed_task_path_host = None + temp_config_dir_host = None + was_banned_by_parser = False + task = None + task_id = None + try: + if no_task_streak > 0 and not queue_policy: # Polling only makes sense for file mode + polling_interval = exec_control.get('worker_polling_interval_seconds', 1) + # --- Add diagnostic logging --- + all_profiles_in_pool = profile_manager_instance.list_profiles() + profiles_in_prefix = [p for p in all_profiles_in_pool if p['name'].startswith(profile_prefix or '')] + if profiles_in_prefix: + state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) + states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) + logger.info(f"[Worker {worker_id}] No tasks found or profiles available. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s. (Streak: {no_task_streak})") + else: + logger.info(f"[Worker {worker_id}] No tasks found or profiles available. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s. (Streak: {no_task_streak})") + # --- End diagnostic logging --- + time.sleep(polling_interval) + if state_manager.shutdown_event.is_set(): continue + + # 1. Get a task + if not queue_policy: + # File-based mode: Find a task and lock its associated profile + locked_profile, claimed_task_path_host = find_task_and_lock_profile( + profile_manager_instance, owner_id, profile_prefix, policy, worker_id + ) + else: + # Queue-based mode + task = state_manager.get_download_task() + if task: + task_id = task.get('id') or task.get('task_id') + if not task_id: + task_id = f"dl_task_{worker_id}_{task_counter}" + task_counter += 1 + task['task_id'] = task_id + + info_json_path_str = task.get('info_json_path') + if not info_json_path_str or not os.path.exists(info_json_path_str): + logger.error(f"[Worker {worker_id}] Task {task_id} has invalid info_json_path: {info_json_path_str}. Skipping.") + state_manager.report_download_skipped(task_id, {"error": "Invalid info_json_path", "task": task}) + + auth_profile_name = task.get('auth_profile_name') + auth_env = task.get('auth_env') + if auth_profile_name and auth_env: + auth_manager = get_auth_manager(profile_manager_instance, auth_env) + if auth_manager: + auth_manager.decrement_pending_downloads(auth_profile_name) + continue + + claimed_task_path_host = Path(info_json_path_str) + + # Now lock a profile + specific_profile = task.get('auth_profile_name') or task.get('profile_name') + if specific_profile: + locked_profile = profile_manager_instance.lock_profile(owner=owner_id, specific_profile_name=specific_profile) + if not locked_profile: + logger.warning(f"[Worker {worker_id}] Could not lock specific profile '{specific_profile}'. Trying any profile with prefix.") + locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) + else: + locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) + + if not locked_profile: + logger.warning(f"[Worker {worker_id}] No profiles available for task {task_id}. 
Re-queueing.") + state_manager.add_download_tasks_batch([task]) + claimed_task_path_host = None + task = None + + if task: + state_manager.mark_download_in_progress(task_id, owner_id) + + if not locked_profile: + if not queue_policy: + no_task_streak += 1 + else: + # In queue mode, if we didn't get a task or a profile, we just poll. + polling_interval = exec_control.get('worker_polling_interval_seconds', 1) + logger.debug(f"[Worker {worker_id}] No download tasks or profiles available. Sleeping for {polling_interval}s.") + time.sleep(polling_interval) + continue + + profile_name = locked_profile['name'] + # We have a task and a lock. + + # User-Agent is not used for download simulation. + user_agent = None + + if claimed_task_path_host: + no_task_streak = 0 + auth_profile_name, auth_env = None, None + info_data = None + + # --- Read info.json content and metadata first --- + try: + with open(claimed_task_path_host, 'r', encoding='utf-8') as f: + info_data = json.load(f) + # This is critical for decrementing the counter in the finally block + metadata = info_data.get('_ytops_metadata', {}) + auth_profile_name = metadata.get('profile_name') + auth_env = metadata.get('auth_env') + except (IOError, json.JSONDecodeError) as e: + logger.error(f"CRITICAL: Could not read or parse task file '{claimed_task_path_host.name}': {e}. This task will be skipped, but the pending downloads counter CANNOT be decremented.") + continue # Skip to finally block to unlock profile + + if args.dummy or args.dummy_batch: + logger.info(f"========== [Worker {worker_id}] BEGIN DUMMY DOCKER DOWNLOAD PER-FORMAT SIMULATION ==========") + logger.info(f"[Worker {worker_id}] Profile: {profile_name} | Task: {claimed_task_path_host.name}") + + dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {}) + min_seconds = dummy_settings.get('download_min_seconds', 1.0) + max_seconds = dummy_settings.get('download_max_seconds', 3.0) + failure_rate = dummy_settings.get('download_failure_rate', 0.0) + skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0) + + # In dummy mode, prioritize the format from the task file, then from the policy. + format_selection = info_data.get('_ytops_download_format') + source_of_format = "task file" + if not format_selection: + format_selection = d_policy.get('formats', '') + source_of_format = "policy" + + if not format_selection: + logger.warning(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. 
Simulating a single download.") + formats_to_test = ['dummy_format'] + else: + formats_to_test = [f.strip() for f in format_selection.split(',') if f.strip()] + logger.info(f"[Worker {worker_id}] DUMMY: Simulating downloads for formats (from {source_of_format}): {', '.join(formats_to_test)}") + + for format_id in formats_to_test: + if state_manager.shutdown_event.is_set(): + logger.info(f"[Worker {worker_id}] DUMMY: Shutdown requested, stopping format simulation.") + break + + logger.info(f"[Worker {worker_id}] DUMMY: Simulating download for format '{format_id}'...") + sleep_duration = random.uniform(min_seconds, max_seconds) + logger.info(f"[Worker {worker_id}] DUMMY: Simulating download for {sleep_duration:.2f}s (from policy range {min_seconds}-{max_seconds}s).") + time.sleep(sleep_duration) + + rand_val = random.random() + should_fail_skipped = rand_val < skipped_rate + should_fail_fatal = not should_fail_skipped and rand_val < (skipped_rate + failure_rate) + + success = False + details = "" + error_type = None + is_tolerated_error = False + + if should_fail_skipped: + logger.warning(f"[Worker {worker_id}] DUMMY: Simulating skipped download failure for format '{format_id}'.") + details = f"Dummy skipped failure for format {format_id}" + error_type = "DummySkippedFailure" + is_tolerated_error = True + profile_manager_instance.record_activity(profile_name, 'tolerated_error') + elif should_fail_fatal: + logger.warning(f"[Worker {worker_id}] DUMMY: Simulating fatal download failure for format '{format_id}'.") + details = f"Dummy fatal failure for format {format_id}" + error_type = "DummyFailure" + profile_manager_instance.record_activity(profile_name, 'download_error') + else: + logger.info(f"[Worker {worker_id}] DUMMY: Simulating download success for format '{format_id}'.") + success = True + details = f"Dummy success for format {format_id}" + profile_manager_instance.record_activity(profile_name, 'download') + + event = { + 'type': 'direct_docker_download', + 'profile': profile_name, + 'proxy_url': locked_profile['proxy'], + 'success': success, + 'details': details, + 'error_type': error_type, + 'is_tolerated_error': is_tolerated_error, + 'format': format_id + } + state_manager.log_event(event) + + logger.info(f"========== [Worker {worker_id}] END DUMMY DOCKER DOWNLOAD SIMULATION ==========") + + # In dummy mode, we just rename the file to processed and continue to the finally block. 
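The rename that follows (and that the non-dummy path repeats later) turns the claimed task file, which carries a `.LOCKED.` suffix, into a `.processed` marker. A small sketch of that convention with a hypothetical filename and lock suffix:

```python
from pathlib import Path

def processed_path_for(claimed_path: Path) -> Path:
    """Strip everything from '.LOCKED.' onward and append '.processed'."""
    base = str(claimed_path).rsplit('.LOCKED.', 1)[0]
    return Path(f"{base}.processed")

print(processed_path_for(Path('run/docker_mount/tasks/abc-user1-xyz.info.json.LOCKED.worker3')))
# run/docker_mount/tasks/abc-user1-xyz.info.json.processed
```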
+ try: + base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] + processed_path = Path(f"{base_path_str}.processed") + claimed_task_path_host.rename(processed_path) + logger.debug(f"DUMMY MODE: Renamed processed task file to '{processed_path.name}'.") + except (OSError, IndexError) as e: + logger.error(f"DUMMY MODE: Failed to rename processed task file '{claimed_task_path_host}': {e}") + + continue # Skip to finally block + + # --- Check for URL expiration before running Docker --- + if d_policy.get('check_url_expiration', True): + # Heuristic: check the first available format URL + first_format = next((f for f in info_data.get('formats', []) if 'url' in f), None) + if first_format: + url_to_check = first_format['url'] + time_shift_minutes = d_policy.get('expire_time_shift_minutes', 0) + status, time_left_seconds = sp_utils.check_url_expiry(url_to_check, time_shift_minutes) + + logger.debug(f"[Worker {worker_id}] [{profile_name}] URL expiration check for task '{claimed_task_path_host.name}': status={status}, time_left={time_left_seconds:.0f}s") + + if status == 'expired': + details = "Download URL is expired" + if time_shift_minutes > 0 and time_left_seconds > 0: + logger.warning(f"[Worker {worker_id}] [{profile_name}] Skipping task '{claimed_task_path_host.name}' because its URL will expire in {time_left_seconds/60:.1f}m (within {time_shift_minutes}m time-shift).") + details = f"URL will expire within {time_shift_minutes}m time-shift" + else: + logger.warning(f"[Worker {worker_id}] [{profile_name}] Skipping task '{claimed_task_path_host.name}' because its URL is expired.") + + profile_manager_instance.record_activity(profile_name, 'tolerated_error') + + event = { + 'type': 'direct_docker_download', 'profile': profile_name, + 'proxy_url': locked_profile['proxy'], 'success': False, + 'error_type': 'Skipped (Expired URL)', 'details': details, + 'is_tolerated_error': True + } + state_manager.log_event(event) + + try: + base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] + processed_path = Path(f"{base_path_str}.processed") + claimed_task_path_host.rename(processed_path) + logger.debug(f"Renamed expired task file to '{processed_path.name}'.") + except (OSError, IndexError) as e: + logger.error(f"Failed to rename expired task file '{claimed_task_path_host}': {e}") + + continue # Skip to the finally block + + # The path to the task file inside the container needs to be relative to the host mount root. + # We must make the task path absolute first to correctly calculate the relative path from the absolute mount path. + relative_task_path = os.path.relpath(os.path.abspath(claimed_task_path_host), host_mount_path) + task_path_container = os.path.join(container_mount_path, relative_task_path) + + # 3. 
Prepare config file on host in a temporary directory + temp_config_dir_host = tempfile.mkdtemp(prefix=f"docker-dl-config-{worker_id}-", dir=host_mount_path) + config_dir_name = os.path.basename(temp_config_dir_host) + config_dir_container = os.path.join(container_mount_path, config_dir_name) + + environment = {} + + base_config_content = "" + base_config_file = direct_policy.get('ytdlp_config_file') + if base_config_file: + config_path_to_read = Path(base_config_file) + if not config_path_to_read.exists(): + config_path_to_read = Path(sp_utils._PROJECT_ROOT) / base_config_file + if config_path_to_read.exists(): + try: + with open(config_path_to_read, 'r', encoding='utf-8') as base_f: + base_config_content = base_f.read() + except IOError as e: + logger.error(f"[Worker {worker_id}] Could not read ytdlp_config_file '{config_path_to_read}': {e}") + + config_overrides = direct_policy.get('ytdlp_config_overrides', {}).copy() + config_overrides['proxy'] = locked_profile['proxy'] + config_overrides['load-info-json'] = task_path_container + config_overrides['output'] = os.path.join(container_download_path, '%(id)s.f%(format_id)s.%(ext)s') + + # Prevent yt-dlp from using a cache directory. + config_overrides['no-cache-dir'] = True + + overrides_content = sp_utils._config_dict_to_flags_file_content(config_overrides) + raw_args_from_policy = direct_policy.get('ytdlp_raw_args', []) + raw_args_content = '\n'.join(raw_args_from_policy) + + config_content = f"{base_config_content.strip()}\n\n# --- Overrides from policy ---\n{overrides_content}" + if raw_args_content: + config_content += f"\n\n# --- Raw args from policy ---\n{raw_args_content}" + + logger.info(f"[Worker {worker_id}] [{profile_name}] Generated yt-dlp config:\n---config---\n{config_content}\n------------") + + ytdlp_config_dir_host = os.path.join(temp_config_dir_host, 'yt-dlp') + os.makedirs(ytdlp_config_dir_host, exist_ok=True) + temp_config_file_host = os.path.join(ytdlp_config_dir_host, 'config') + with open(temp_config_file_host, 'w', encoding='utf-8') as f: + f.write(config_content) + + # 4. Construct and run docker run command + volumes = { + os.path.abspath(host_mount_path): {'bind': container_mount_path, 'mode': 'ro'}, + os.path.abspath(host_download_path): {'bind': container_download_path, 'mode': 'rw'} + } + # The command tells yt-dlp exactly where to find the config file we created. 
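Both workers translate between host and container views of the shared mount: the task path is rewritten for the container here, and the fetch worker maps container paths reported in the logs back to the host. A minimal sketch of that mapping, using made-up mount paths:

```python
import os

HOST_MOUNT = '/srv/ytops/run/docker_mount'       # hypothetical host path
CONTAINER_MOUNT = '/data'                        # hypothetical container path

def to_container(host_path):
    rel = os.path.relpath(os.path.abspath(host_path), HOST_MOUNT)
    return os.path.join(CONTAINER_MOUNT, rel)

def to_host(container_path):
    rel = os.path.relpath(container_path, CONTAINER_MOUNT)
    return os.path.join(HOST_MOUNT, rel)

task = '/srv/ytops/run/docker_mount/download_tasks/abc.info.json'
print(to_container(task))            # /data/download_tasks/abc.info.json
print(to_host(to_container(task)))   # round-trips back to the host path
```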
+ command = ['yt-dlp', '--config-locations', os.path.join(config_dir_container, 'yt-dlp/config')] + logger.info(f"[Worker {worker_id}] [{profile_name}] Running docker command: {' '.join(shlex.quote(s) for s in command)}") + + # For logging purposes, construct the full equivalent command line with host paths + log_config_overrides_for_host = config_overrides.copy() + log_config_overrides_for_host['load-info-json'] = str(claimed_task_path_host) + log_config_overrides_for_host['output'] = os.path.join(host_download_path, '%(id)s.f%(format_id)s.%(ext)s') + + log_command_override = ['yt-dlp'] + if base_config_content: + log_command_override.extend(sp_utils._parse_config_file_to_cli_args(base_config_content)) + log_command_override.extend(sp_utils._config_dict_to_cli_flags(log_config_overrides_for_host)) + raw_args_from_policy = direct_policy.get('ytdlp_raw_args', []) + for raw_arg in raw_args_from_policy: + log_command_override.extend(shlex.split(raw_arg)) + + # --- Live log parsing and activity recording --- + live_success_count = 0 + live_failure_count = 0 + live_tolerated_count = 0 + activity_lock = threading.Lock() + + tolerated_error_patterns = direct_policy.get('tolerated_error_patterns', []) + fatal_error_patterns = direct_policy.get('fatal_error_patterns', []) + + def log_parser_callback(line): + nonlocal live_success_count, live_failure_count, live_tolerated_count, was_banned_by_parser + + # Success is a high-priority check. Only record one success per task. + if '[download] 100% of' in line or 'has already been downloaded' in line: + with activity_lock: + # Only count one success per task + if live_success_count == 0: + live_success_count += 1 + logger.info(f"[Worker {worker_id}] [{profile_name}] Live download success detected from log.") + profile_manager_instance.record_activity(profile_name, 'download') + return False + + # Check for fatal patterns + for pattern in fatal_error_patterns: + if re.search(pattern, line, re.IGNORECASE): + with activity_lock: + live_failure_count += 1 + logger.error(f"[Worker {worker_id}] [{profile_name}] Live FATAL download error #{live_failure_count} detected from log: {line}") + profile_manager_instance.record_activity(profile_name, 'download_error') + if direct_policy.get('ban_on_fatal_error_in_batch'): + logger.warning(f"Banning profile '{profile_name}' immediately due to fatal download error to stop container.") + profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during download') + was_banned_by_parser = True + return True # Signal to stop container + return False # Do not stop if ban_on_fatal_error_in_batch is false + + # Only process lines that contain ERROR: for tolerated/generic failures + if 'ERROR:' not in line: + return False + + # Check if it's a tolerated error + for pattern in tolerated_error_patterns: + if re.search(pattern, line, re.IGNORECASE): + with activity_lock: + live_tolerated_count += 1 + logger.warning(f"[Worker {worker_id}] [{profile_name}] Live TOLERATED download error #{live_tolerated_count} detected from log: {line}") + profile_manager_instance.record_activity(profile_name, 'tolerated_error') + return False + + # If it's an ERROR: line and not tolerated, it's a failure + with activity_lock: + live_failure_count += 1 + logger.warning(f"[Worker {worker_id}] [{profile_name}] Live download failure #{live_failure_count} detected from log: {line}") + profile_manager_instance.record_activity(profile_name, 'download_error') + + return False + + retcode, stdout, stderr, stop_reason = 
run_docker_container( + image_name=image_name, + command=command, + volumes=volumes, + stream_prefix=f"[Worker {worker_id} | docker-ytdlp] ", + network_name=network_name, + log_callback=log_parser_callback, + profile_manager=profile_manager_instance, + profile_name=profile_name, + environment=environment, + log_command_override=log_command_override + ) + + # 5. Post-process and record activity + full_output = f"{stdout}\n{stderr}" + is_bot_error = "Sign in to confirm you're not a bot" in full_output + if is_bot_error: + logger.warning(f"[Worker {worker_id}] [{profile_name}] Bot detection occurred during download. Marking as failure.") + + # --- Final Outcome Determination --- + # Activity is now recorded live by the log parser. This block just determines + # the overall success/failure for logging and event reporting. + success = False + final_outcome = "unknown" + with activity_lock: + if live_success_count > 0: + success = True + final_outcome = "download" + elif live_failure_count > 0 or is_bot_error: + final_outcome = "download_error" + elif live_tolerated_count > 0: + final_outcome = "tolerated_error" + elif retcode == 0: + # Fallback if no logs were matched but exit was clean. + success = True + final_outcome = "download" + logger.warning(f"[Worker {worker_id}] [{profile_name}] No specific success/error log line matched, but exit code is 0. Assuming success, but this may indicate a parsing issue.") + # We record a success here as a fallback, in case the log parser missed it. + profile_manager_instance.record_activity(profile_name, 'download') + else: + final_outcome = "download_error" + logger.warning(f"[Worker {worker_id}] [{profile_name}] No specific error log line matched, but exit code was {retcode}. Recording a generic download_error.") + profile_manager_instance.record_activity(profile_name, 'download_error') + + # --- Airflow Directory Logic --- + if success and d_policy.get('output_to_airflow_ready_dir'): + try: + # Get video_id from the info.json + with open(claimed_task_path_host, 'r', encoding='utf-8') as f: + info_data = json.load(f) + video_id = info_data.get('id') + + if not video_id: + logger.error(f"[{profile_name}] Could not find video ID in '{claimed_task_path_host.name}' for moving files.") + else: + # Scan the download directory for all resulting media files + downloaded_files = [f for f in os.listdir(host_download_path) if not f.endswith(('.part', '.ytdl'))] + logger.info(f"[{profile_name}] Found {len(downloaded_files)} downloaded file(s) to process in '{host_download_path}'.") + + if not downloaded_files: + logger.warning(f"[{profile_name}] Download reported success, but no media files were found in '{host_download_path}'.") + + # --- Prepare the single destination directory for all artifacts --- + now = datetime.now() + rounded_minute = (now.minute // 10) * 10 + timestamp_str = now.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}" + + base_path = d_policy.get('airflow_ready_dir_base_path', 'downloadfiles/videos/ready') + if not os.path.isabs(base_path): + base_path = os.path.join(sp_utils._PROJECT_ROOT, base_path) + final_dir_base = os.path.join(base_path, timestamp_str) + final_dir_path = os.path.join(final_dir_base, video_id) + os.makedirs(final_dir_path, exist_ok=True) + + # --- Copy info.json once --- + new_info_json_name = f"info_{video_id}.json" + dest_info_json_path = os.path.join(final_dir_path, new_info_json_name) + if not os.path.exists(dest_info_json_path): + shutil.copy(claimed_task_path_host, dest_info_json_path) + logger.info(f"[{profile_name}] 
Copied info.json to {dest_info_json_path}") + + # --- Process each downloaded media file --- + for downloaded_filename in downloaded_files: + downloaded_file_host_path = os.path.join(host_download_path, downloaded_filename) + if not os.path.exists(downloaded_file_host_path): + logger.warning(f"[{profile_name}] File '{downloaded_filename}' disappeared before it could be processed.") + continue + + media_path = Path(downloaded_file_host_path) + + # 1. Run ffprobe + if d_policy.get('run_ffprobe'): + ffprobe_filename = f"ffprobe_{media_path.stem}.json" + # Create ffprobe json in the final destination to avoid moving it + ffprobe_output_path = Path(final_dir_path) / ffprobe_filename + _run_ffprobe(media_path, ffprobe_output_path) + + # 2. Move media file + final_media_path = Path(final_dir_path) / media_path.name + shutil.move(str(media_path), str(final_media_path)) + logger.info(f"[{profile_name}] Moved media file '{media_path.name}' to {final_media_path}") + media_path = final_media_path # Update media_path to its new location + + # 3. Cleanup the media file at its final location + if d_policy.get('cleanup'): + if os.path.exists(media_path): + _cleanup_media_file(media_path) + + except Exception as e: + logger.error(f"[{profile_name}] Failed during post-download processing for Airflow: {e}", exc_info=True) + + event_details = f"Docker download finished. Exit: {retcode}. Final Outcome: {final_outcome}. (Live successes: {live_success_count}, Live failures: {live_failure_count}, Live tolerated: {live_tolerated_count})" + if not success and stderr: + event_details += f" Stderr: {stderr.strip().splitlines()[-1] if stderr.strip() else 'N/A'}" + if stop_reason: + event_details += f" Aborted: {stop_reason}." + + event = { 'type': 'direct_docker_download', 'profile': profile_name, 'proxy_url': locked_profile['proxy'], 'success': success, 'details': event_details } + state_manager.log_event(event) + + logger.info(f"[Worker {worker_id}] [{profile_name}] Task processing complete. Worker will now unlock profile and attempt next task.") + + # 6. Clean up task file + if not queue_policy: + # File-based mode: rename to .processed + try: + # The claimed_task_path_host has a .LOCKED suffix, remove it before adding .processed + base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] + processed_path = Path(f"{base_path_str}.processed") + claimed_task_path_host.rename(processed_path) + logger.debug(f"[{sp_utils.get_display_name(claimed_task_path_host)}] Renamed processed task file to '{processed_path.name}'.") + except (OSError, IndexError) as e: + logger.error(f"Failed to rename processed task file '{claimed_task_path_host}': {e}") + elif d_policy.get('rename_source_info_json_on_success'): + # Queue-based mode: respect rename policy + source_path_to_rename = task.get('info_json_path') + if success and source_path_to_rename and os.path.exists(source_path_to_rename): + try: + processed_path = source_path_to_rename + ".processed" + shutil.move(source_path_to_rename, processed_path) + logger.info(f"[Worker {worker_id}] Renamed source info.json to '{processed_path}'") + except Exception as e: + logger.warning(f"[Worker {worker_id}] Could not rename source info.json '{source_path_to_rename}': {e}") + # After this point, claimed_task_path_host is no longer valid. + # The metadata has already been read into auth_profile_name and auth_env. + else: + # This case should not be reached with the new task-first locking logic. 
+ logger.warning(f"[Worker {worker_id}] Inconsistent state: locked profile '{profile_name}' but no task was claimed. Unlocking and continuing.") + + except Exception as e: + logger.error(f"[Worker {worker_id}] An unexpected error occurred in the worker loop: {e}", exc_info=True) + if locked_profile: + profile_manager_instance.record_activity(locked_profile['name'], 'failure') # Generic failure + time.sleep(5) + finally: + if locked_profile: + if claimed_task_path_host: + # The auth_profile_name and auth_env variables were populated in the `try` block + # before the task file was renamed or deleted. + if auth_profile_name and auth_env: + auth_manager = get_auth_manager(profile_manager_instance, auth_env) + if auth_manager: + try: + auth_manager.decrement_pending_downloads(auth_profile_name) + logger.info(f"[Worker {worker_id}] Decremented pending downloads for auth profile '{auth_profile_name}' in env '{auth_env}'.") + except Exception as e: + logger.error(f"[Worker {worker_id}] Failed to decrement pending downloads for auth profile '{auth_profile_name}' in env '{auth_env}': {e}", exc_info=True) + else: + logger.error(f"[Worker {worker_id}] Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.") + else: + logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env in info.json metadata. Pending downloads counter will not be decremented. (Profile: {auth_profile_name}, Env: {auth_env})") + + if was_banned_by_parser: + logger.info(f"[Worker {worker_id}] Profile '{locked_profile['name']}' was already banned by the log parser. Skipping unlock/cooldown.") + else: + last_used_profile_name = locked_profile['name'] + cooldown = None + # Only apply cooldown if a task was actually claimed and processed. + if claimed_task_path_host: + # Enforcer is the only point where we configure to apply different policies, + # since we might restart enforcer, but won't restart stress-policy working on auth and downloads simultaneously. + # This is like applying a policy across multiple workers/machines without needing to restart each of them. + # DESIGN: The cooldown duration is not configured in the worker's policy. + # Instead, it is read from a central Redis key. This key is set by the + # policy-enforcer, making the enforcer the single source of truth for + # this policy. This allows changing the cooldown behavior without + # restarting the workers. + cooldown_source_value = profile_manager_instance.get_config('unlock_cooldown_seconds') + source_description = "Redis config" + + if cooldown_source_value is None: + cooldown_source_value = d_policy.get('default_unlock_cooldown_seconds') + source_description = "local policy" + + if cooldown_source_value is not None: + try: + # If from Redis, it's a string that needs parsing. + # If from local policy, it's already an int or list. 
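The cooldown handling that follows, like its counterparts in the other workers in this patch, accepts an int, a `[min, max]` pair, or a digit string, whether the value arrives as a Redis string or a local policy value. A consolidated sketch of those parsing rules; the helper name is illustrative.

```python
import json
import random

def parse_cooldown(value):
    """Return a concrete cooldown in seconds, or None if nothing usable."""
    if value is None:
        return None
    try:
        if isinstance(value, str):
            value = json.loads(value)
    except (json.JSONDecodeError, TypeError):
        # Fall back to plain digit strings like "30".
        return int(value) if isinstance(value, str) and value.isdigit() else None
    if isinstance(value, list) and len(value) == 2 and value[0] < value[1]:
        return random.randint(value[0], value[1])
    if isinstance(value, int):
        return value
    return None

print(parse_cooldown('30'))        # 30
print(parse_cooldown('[20, 60]'))  # a random value between 20 and 60
print(parse_cooldown([5, 10]))     # a random value between 5 and 10
```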
+ val = cooldown_source_value + if isinstance(val, str): + val = json.loads(val) + + if isinstance(val, list) and len(val) == 2 and val[0] < val[1]: + cooldown = random.randint(val[0], val[1]) + elif isinstance(val, int): + cooldown = val + + if cooldown is not None: + logger.debug(f"Determined cooldown from {source_description}: {cooldown_source_value}") + + except (json.JSONDecodeError, TypeError): + if isinstance(cooldown_source_value, str) and cooldown_source_value.isdigit(): + cooldown = int(cooldown_source_value) + logger.debug(f"Determined cooldown from {source_description}: {cooldown_source_value}") + + if cooldown: + logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") + + profile_manager_instance.unlock_profile( + locked_profile['name'], + owner=owner_id, + rest_for_seconds=cooldown + ) + if not queue_policy and claimed_task_path_host and os.path.exists(claimed_task_path_host): + # Clean up .LOCKED file in file-based mode + try: os.remove(claimed_task_path_host) + except OSError: pass + + if task and task_id: + state_manager.remove_download_in_progress(task_id) + + if temp_config_dir_host and os.path.exists(temp_config_dir_host): + try: + # Use a thread to timeout rmtree in case of hanging + def rmtree_thread(): + shutil.rmtree(temp_config_dir_host) + t = threading.Thread(target=rmtree_thread) + t.daemon = True + t.start() + t.join(timeout=10.0) + if t.is_alive(): + logger.warning(f"[Worker {worker_id}] Timeout removing temp config directory {temp_config_dir_host}. Leaving it.") + # The thread will be killed when daemon exits + except OSError: pass + + logger.info(f"[Worker {worker_id}] Worker loop finished.") + return [] + except Exception as e: + logger.error(f"[Worker {worker_id}] A fatal, unhandled error occurred in the worker thread: {e}", exc_info=True) + return [] diff --git a/ytops_client-source/ytops_client/stress_policy/direct_download_worker.py b/ytops_client-source/ytops_client/stress_policy/direct_download_worker.py new file mode 100644 index 0000000..80944d4 --- /dev/null +++ b/ytops_client-source/ytops_client/stress_policy/direct_download_worker.py @@ -0,0 +1,302 @@ +import collections +import json +import logging +import os +import random +import re +import shlex +import sys +import tempfile +import shutil +import subprocess +import threading +import time +from copy import deepcopy +from datetime import datetime, timezone +from pathlib import Path + +from . import utils as sp_utils +from .process_runners import run_command, get_worker_id +from ..profile_manager_tool import ProfileManager +from .worker_utils import find_task_and_lock_profile, get_auth_manager + + +logger = logging.getLogger(__name__) + + +def run_direct_download_worker(worker_id, policy, state_manager, args, profile_manager_instance, running_processes, process_lock, profile_prefix=None): + """A persistent worker for the 'direct_download_cli' orchestration mode.""" + owner_id = f"direct-dl-worker-{worker_id}" + settings = policy.get('settings', {}) + exec_control = policy.get('execution_control', {}) + d_policy = policy.get('download_policy', {}) + direct_policy = policy.get('direct_download_cli_policy', {}) + + # Prioritize the passed-in profile_prefix for worker pool compatibility. + if not profile_prefix: + profile_prefix = d_policy.get('profile_prefix') + + # Unlike other modes, this worker can function without a prefix (it will try to lock any active profile). + # The check `if not profile_prefix` is removed to allow this flexibility. 
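Both docker workers above bound their temp-directory cleanup by running `shutil.rmtree` in a daemon thread and giving up after a timeout, so a hung filesystem cannot stall the worker loop. A standalone version of that pattern; the path in the commented usage is a made-up example.

```python
import shutil
import threading

def rmtree_with_timeout(path, timeout=10.0):
    """Remove a directory tree, but stop waiting after `timeout` seconds.
    Returns False if the removal is still running (the daemon thread is
    abandoned and dies with the process)."""
    t = threading.Thread(target=shutil.rmtree, args=(path,), daemon=True)
    t.start()
    t.join(timeout)
    return not t.is_alive()

# if not rmtree_with_timeout('/tmp/docker-dl-config-example'):
#     print('Timeout removing temp dir; leaving it for manual cleanup.')
```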
+ + output_dir = direct_policy.get('output_dir') + if not output_dir: + logger.error(f"[Worker {worker_id}] Direct download mode requires 'direct_download_cli_policy.output_dir'. Worker exiting.") + return [] + + os.makedirs(output_dir, exist_ok=True) + no_task_streak = 0 + + while not state_manager.shutdown_event.is_set(): + locked_profile = None + claimed_task_path = None + auth_profile_name, auth_env = None, None # For finally block + try: + # 0. If no tasks were found, pause briefly. + if no_task_streak > 0: + polling_interval = exec_control.get('worker_polling_interval_seconds', 1) + # --- Add diagnostic logging --- + all_profiles_in_pool = profile_manager_instance.list_profiles() + profiles_in_prefix = [p for p in all_profiles_in_pool if p['name'].startswith(profile_prefix or '')] + if profiles_in_prefix: + state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) + states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) + logger.info(f"[Worker {worker_id}] No tasks found for available profiles. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s. (Streak: {no_task_streak})") + else: + logger.info(f"[Worker {worker_id}] No tasks found for available profiles. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s. (Streak: {no_task_streak})") + # --- End diagnostic logging --- + time.sleep(polling_interval) + if state_manager.shutdown_event.is_set(): continue + + # 1. Find a task and lock its associated profile + locked_profile, claimed_task_path = find_task_and_lock_profile( + profile_manager_instance, owner_id, profile_prefix, policy, worker_id + ) + + if not locked_profile: + no_task_streak += 1 + # The main loop will pause if the streak continues. + continue + + profile_name = locked_profile['name'] + # We have a task and a lock. + + if claimed_task_path: + no_task_streak = 0 # Reset streak + + # --- Read metadata before processing/deleting file --- + try: + with open(claimed_task_path, 'r', encoding='utf-8') as f: + info_data = json.load(f) + metadata = info_data.get('_ytops_metadata', {}) + auth_profile_name = metadata.get('profile_name') + auth_env = metadata.get('auth_env') + except (IOError, json.JSONDecodeError) as e: + logger.error(f"CRITICAL: Could not read or parse task file '{claimed_task_path.name}': {e}. This task will be skipped, but the pending downloads counter CANNOT be decremented.") + continue # Skip to finally block to unlock profile + + # 3. 
Construct and run the command + ytdlp_cmd_str = direct_policy.get('ytdlp_command') + if not ytdlp_cmd_str: + logger.error(f"[Worker {worker_id}] Direct download mode requires 'direct_download_cli_policy.ytdlp_command'.") + break + + proxy_url = locked_profile['proxy'] + proxy_rename = direct_policy.get('proxy_rename') + if proxy_rename: + rename_rule = proxy_rename.strip("'\"") + if rename_rule.startswith('s/') and rename_rule.count('/') >= 2: + try: + parts = rename_rule.split('/') + proxy_url = re.sub(parts[1], parts[2], proxy_url) + except (re.error, IndexError): + logger.error(f"[Worker {worker_id}] Invalid proxy_rename rule: {proxy_rename}") + + output_template = os.path.join(output_dir, '%(title)s - %(id)s.%(ext)s') + + cmd = shlex.split(ytdlp_cmd_str) + cmd.extend(['--load-info-json', str(claimed_task_path)]) + cmd.extend(['--proxy', proxy_url]) + cmd.extend(['-o', output_template]) + + ytdlp_args = direct_policy.get('ytdlp_args') + if ytdlp_args: + cmd.extend(shlex.split(ytdlp_args)) + + if args.verbose and '--verbose' not in cmd: + cmd.append('--verbose') + + custom_env = direct_policy.get('env_vars', {}).copy() + + # --- PYTHONPATH for custom yt-dlp module --- + ytdlp_module_path = direct_policy.get('ytdlp_module_path') + if ytdlp_module_path: + existing_pythonpath = custom_env.get('PYTHONPATH', os.environ.get('PYTHONPATH', '')) + custom_env['PYTHONPATH'] = f"{ytdlp_module_path}{os.pathsep}{existing_pythonpath}".strip(os.pathsep) + logger.debug(f"[Worker {worker_id}] Using custom PYTHONPATH: {custom_env['PYTHONPATH']}") + + # Pass profile info to the custom yt-dlp process + custom_env['YTDLP_PROFILE_NAME'] = profile_name + custom_env['YTDLP_PROXY_URL'] = locked_profile['proxy'] # Original proxy + env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') + custom_env['YTDLP_SIM_MODE'] = env_name + + # Create a per-profile cache directory and set XDG_CACHE_HOME + cache_dir_base = direct_policy.get('cache_dir_base', '.cache') + profile_cache_dir = os.path.join(cache_dir_base, profile_name) + try: + os.makedirs(profile_cache_dir, exist_ok=True) + custom_env['XDG_CACHE_HOME'] = profile_cache_dir + except OSError as e: + logger.error(f"[Worker {worker_id}] Failed to create cache directory '{profile_cache_dir}': {e}") + + logger.info(f"[Worker {worker_id}] [{profile_name}] Processing task '{claimed_task_path.name}'...") + if args.dummy or args.dummy_batch: + logger.info(f"========== [Worker {worker_id}] BEGIN DUMMY DIRECT DOWNLOAD ==========") + logger.info(f"[Worker {worker_id}] Profile: {profile_name} | Task: {claimed_task_path.name}") + logger.info(f"[Worker {worker_id}] Would run command: {' '.join(shlex.quote(s) for s in cmd)}") + logger.info(f"[Worker {worker_id}] With environment: {custom_env}") + + dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {}) + min_seconds = dummy_settings.get('download_min_seconds', 0.5) + max_seconds = dummy_settings.get('download_max_seconds', 1.5) + failure_rate = dummy_settings.get('download_failure_rate', 0.0) + skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0) + + time.sleep(random.uniform(min_seconds, max_seconds)) + + rand_val = random.random() + should_fail_skipped = rand_val < skipped_rate + should_fail_fatal = not should_fail_skipped and rand_val < (skipped_rate + failure_rate) + + success = False + details = "" + error_type = None + is_tolerated_error = False + + if should_fail_skipped: + logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating skipped 
download failure for task '{claimed_task_path.name}'.") + details = "Dummy skipped failure" + error_type = "DummySkippedFailure" + is_tolerated_error = True + profile_manager_instance.record_activity(profile_name, 'tolerated_error') + elif should_fail_fatal: + logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating fatal download failure for task '{claimed_task_path.name}'.") + details = "Dummy fatal failure" + error_type = "DummyFailure" + profile_manager_instance.record_activity(profile_name, 'download_error') + else: + logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating download success for task '{claimed_task_path.name}'.") + success = True + details = "Dummy success" + profile_manager_instance.record_activity(profile_name, 'download') + + event = { + 'type': 'direct_download', + 'profile': profile_name, + 'proxy_url': locked_profile['proxy'], + 'success': success, + 'details': details, + 'error_type': error_type, + 'is_tolerated_error': is_tolerated_error + } + state_manager.log_event(event) + logger.info(f"========== [Worker {worker_id}] END DUMMY DIRECT DOWNLOAD ==========") + else: + # --- Real execution --- + logger.info(f"[Worker {worker_id}] [{profile_name}] Running command: {' '.join(shlex.quote(s) for s in cmd)}") + retcode, stdout, stderr = run_command( + cmd, running_processes, process_lock, env=custom_env, stream_output=args.verbose, + stream_prefix=f"[Worker {worker_id} | yt-dlp] " + ) + + success = (retcode == 0) + details = "" + error_type = None + is_tolerated_error = False + + if success: + details = "Download successful" + profile_manager_instance.record_activity(profile_name, 'download') + else: + # Check for tolerated errors first + tolerated_patterns = direct_policy.get('tolerated_error_patterns', []) + for pattern in tolerated_patterns: + if re.search(pattern, stderr, re.IGNORECASE): + is_tolerated_error = True + error_type = "ToleratedError" + details = f"Tolerated error: {stderr.strip().splitlines()[-1] if stderr.strip() else 'Unknown'}" + profile_manager_instance.record_activity(profile_name, 'tolerated_error') + break + + if not is_tolerated_error: + error_type = f"Exit Code {retcode}" + details = f"Download failed. Stderr: {stderr.strip().splitlines()[-1] if stderr.strip() else 'Unknown'}" + profile_manager_instance.record_activity(profile_name, 'download_error') + + event = { + 'type': 'direct_download', + 'profile': profile_name, + 'proxy_url': locked_profile['proxy'], + 'success': success, + 'details': details, + 'error_type': error_type, + 'is_tolerated_error': is_tolerated_error + } + state_manager.log_event(event) + + # 4. Clean up the processed task file + try: + # The claimed_task_path has a .LOCKED suffix, remove it before adding .processed + base_path_str = str(claimed_task_path).rsplit('.LOCKED.', 1)[0] + processed_path = Path(f"{base_path_str}.processed") + claimed_task_path.rename(processed_path) + logger.debug(f"[{sp_utils.get_display_name(claimed_task_path)}] Renamed processed task file to '{processed_path.name}'.") + except (OSError, IndexError) as e: + logger.error(f"Failed to rename processed task file '{claimed_task_path}': {e}") + else: + logger.warning(f"[Worker {worker_id}] Inconsistent state: locked profile '{profile_name}' but no task was claimed. 
Unlocking and continuing.") + + except Exception as e: + logger.error(f"[Worker {worker_id}] An unexpected error occurred in the worker loop: {e}", exc_info=True) + if locked_profile: + profile_manager_instance.record_activity(locked_profile['name'], 'failure') # Generic failure + time.sleep(5) + finally: + if locked_profile: + # Decrement pending downloads counter on the original auth profile + if claimed_task_path and auth_profile_name and auth_env: + auth_manager = get_auth_manager(profile_manager_instance, auth_env) + if auth_manager: + auth_manager.decrement_pending_downloads(auth_profile_name) + else: + logger.error(f"Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.") + elif claimed_task_path: + logger.warning(f"Could not find auth profile name and/or auth_env in info.json metadata. Pending downloads counter will not be decremented.") + + cooldown = None + if claimed_task_path: + cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds') + if cooldown_config: + try: + val = json.loads(cooldown_config) + if isinstance(val, list) and len(val) == 2 and val[0] < val[1]: + cooldown = random.randint(val[0], val[1]) + elif isinstance(val, int): + cooldown = val + except (json.JSONDecodeError, TypeError): + if isinstance(cooldown_config, str) and cooldown_config.isdigit(): + cooldown = int(cooldown_config) + + if cooldown: + logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") + + profile_manager_instance.unlock_profile( + locked_profile['name'], + owner=owner_id, + rest_for_seconds=cooldown + ) + locked_profile = None + + logger.info(f"[Worker {worker_id}] Worker loop finished.") + return [] diff --git a/ytops_client-source/ytops_client/stress_policy/queue_workers.py b/ytops_client-source/ytops_client/stress_policy/queue_workers.py index 1a5857f..ee91930 100644 --- a/ytops_client-source/ytops_client/stress_policy/queue_workers.py +++ b/ytops_client-source/ytops_client/stress_policy/queue_workers.py @@ -23,7 +23,7 @@ from typing import Dict, List, Optional, Any, Tuple, Union from . import utils as sp_utils from .process_runners import run_command, run_docker_container, get_worker_id -from .workers import get_auth_manager +from .worker_utils import get_auth_manager from .queue_provider import RedisQueueProvider logger = logging.getLogger(__name__) diff --git a/ytops_client-source/ytops_client/stress_policy/throughput_worker.py b/ytops_client-source/ytops_client/stress_policy/throughput_worker.py new file mode 100644 index 0000000..25bcb95 --- /dev/null +++ b/ytops_client-source/ytops_client/stress_policy/throughput_worker.py @@ -0,0 +1,158 @@ +import collections +import json +import logging +import os +import random +import re +import shlex +import sys +import tempfile +import shutil +import subprocess +import threading +import time +from copy import deepcopy +from datetime import datetime, timezone +from pathlib import Path + +from . 
import utils as sp_utils +from .process_runners import run_command, run_docker_container, get_worker_id +from ..profile_manager_tool import ProfileManager +from .worker_utils import find_task_and_lock_profile, _run_download_logic + + +logger = logging.getLogger(__name__) + + +def run_throughput_worker(worker_id, policy, state_manager, args, profile_manager_instance, running_processes, process_lock, profile_prefix=None): + """A persistent worker for the 'throughput' orchestration mode.""" + owner_id = f"throughput-worker-{worker_id}" + settings = policy.get('settings', {}) + exec_control = policy.get('execution_control', {}) + + # Prioritize the passed-in profile_prefix for worker pool compatibility. + if not profile_prefix: + d_policy = policy.get('download_policy', {}) + profile_prefix = d_policy.get('profile_prefix') + + if not profile_prefix: + logger.error(f"[Worker {worker_id}] Throughput mode requires a 'profile_prefix' from the worker pool or 'download_policy'. Worker exiting.") + return [] + + no_task_streak = 0 + + while not state_manager.shutdown_event.is_set(): + locked_profile = None + claimed_task_path = None + try: + # 0. If no tasks were found previously, pause briefly. + if no_task_streak > 0: + polling_interval = exec_control.get('worker_polling_interval_seconds', 1) + logger.info(f"[Worker {worker_id}] No tasks found in previous attempt(s). Pausing for {polling_interval}s. (Streak: {no_task_streak})") + time.sleep(polling_interval) + if state_manager.shutdown_event.is_set(): continue + + # 1. Find a task and lock its associated profile + locked_profile, claimed_task_path = find_task_and_lock_profile( + profile_manager_instance, owner_id, profile_prefix, policy, worker_id + ) + + if not locked_profile: + # No task/profile combo was available. + no_task_streak += 1 + polling_interval = exec_control.get('worker_polling_interval_seconds', 1) + logger.info(f"[Worker {worker_id}] No available tasks found for any active profiles. Pausing for {polling_interval}s.") + time.sleep(polling_interval) + continue + + profile_name = locked_profile['name'] + + # We have a task and a lock. + if claimed_task_path: + no_task_streak = 0 # Reset streak + # 3. Process the task + try: + with open(claimed_task_path, 'r', encoding='utf-8') as f: + info_json_content = f.read() + except (IOError, FileNotFoundError) as e: + logger.error(f"[{sp_utils.get_display_name(claimed_task_path)}] Could not read claimed task file: {e}") + # Unlock profile and continue, file might be corrupted + profile_manager_instance.unlock_profile(profile_name, owner=owner_id) + locked_profile = None + # Clean up the bad file + try: claimed_task_path.unlink() + except OSError: pass + continue + + # The locked profile's proxy MUST be used for the download. + local_policy = deepcopy(policy) + local_policy.setdefault('download_policy', {})['proxy'] = locked_profile['proxy'] + + _run_download_logic( + source=claimed_task_path, + info_json_content=info_json_content, + policy=local_policy, + state_manager=state_manager, + args=args, + running_processes=running_processes, + process_lock=process_lock, + profile_name=profile_name, + profile_manager_instance=profile_manager_instance + ) + + # 4. 
Clean up the processed task file + try: + os.remove(claimed_task_path) + logger.debug(f"[{sp_utils.get_display_name(claimed_task_path)}] Removed processed task file.") + except OSError as e: + logger.error(f"Failed to remove processed task file '{claimed_task_path}': {e}") + else: + # This case should not be reached with the new task-first locking logic. + # If it is, it means find_task_and_lock_profile returned a profile but no task. + logger.warning(f"[Worker {worker_id}] Inconsistent state: locked profile '{profile_name}' but no task was claimed. Unlocking and continuing.") + + except Exception as e: + logger.error(f"[Worker {worker_id}] An unexpected error occurred in the worker loop: {e}", exc_info=True) + time.sleep(5) # Pause before retrying to avoid spamming errors + finally: + if locked_profile: + # 5. Unlock the profile. Only apply cooldown if a task was processed. + cooldown = None + if claimed_task_path: + # Enforcer is the only point where we configure to apply different policies, + # since we might restart enforcer, but won't restart stress-policy working on auth and downloads simultaneously. + # This is like applying a policy across multiple workers/machines without needing to restart each of them. + # DESIGN: The cooldown duration is not configured in the worker's policy. + # Instead, it is read from a central Redis key. This key is set by the + # policy-enforcer, making the enforcer the single source of truth for + # this policy. This allows changing the cooldown behavior without + # restarting the workers. + cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds') + if cooldown_config: + try: + val = json.loads(cooldown_config) + if isinstance(val, list) and len(val) == 2 and val[0] < val[1]: + cooldown = random.randint(val[0], val[1]) + elif isinstance(val, int): + cooldown = val + except (json.JSONDecodeError, TypeError): + if cooldown_config.isdigit(): + cooldown = int(cooldown_config) + + if cooldown: + logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") + + profile_manager_instance.unlock_profile( + locked_profile['name'], + owner=owner_id, + rest_for_seconds=cooldown + ) + locked_profile = None + + # 6. Throughput is now controlled by the enforcer via the profile's + # 'unlock_cooldown_seconds' policy, which puts the profile into a + # RESTING state. The worker does not need to sleep here and can + # immediately try to lock a new profile to maximize throughput. + + logger.info(f"[Worker {worker_id}] Worker loop finished.") + return [] # This function doesn't return results directly \ No newline at end of file diff --git a/ytops_client-source/ytops_client/stress_policy/worker_utils.py b/ytops_client-source/ytops_client/stress_policy/worker_utils.py new file mode 100644 index 0000000..ec1e63a --- /dev/null +++ b/ytops_client-source/ytops_client/stress_policy/worker_utils.py @@ -0,0 +1,1004 @@ +import collections +import json +import logging +import os +import random +import re +import shlex +import sys +import tempfile +import shutil +import subprocess +import threading +import time +from copy import deepcopy +from datetime import datetime, timezone +from pathlib import Path + +from . import utils as sp_utils +from .process_runners import run_command, get_worker_id +from ..profile_manager_tool import ProfileManager + + +logger = logging.getLogger(__name__) + +# --- Auth Profile Manager Cache --- +# This is a cache to hold a ProfileManager instance for the auth simulation. 
+# It's needed so that download workers can decrement the correct pending download counter. +_auth_manager_cache = {} +_auth_manager_lock = threading.Lock() + +def get_auth_manager(current_manager, auth_env: str): + """ + Gets a ProfileManager instance for a specific auth simulation environment. + It uses the auth_env provided from the info.json metadata. + """ + with _auth_manager_lock: + if not auth_env: + return None + + if auth_env in _auth_manager_cache: + return _auth_manager_cache[auth_env] + + logger.info(f"Creating new ProfileManager for auth simulation env: '{auth_env}'") + try: + # Re-use connection settings from the current manager + redis_conn_kwargs = current_manager.redis.connection_pool.connection_kwargs + auth_key_prefix = f"{auth_env}_profile_mgmt_" + + auth_manager = ProfileManager( + redis_host=redis_conn_kwargs.get('host'), + redis_port=redis_conn_kwargs.get('port'), + redis_password=redis_conn_kwargs.get('password'), + redis_db=redis_conn_kwargs.get('db'), + key_prefix=auth_key_prefix + ) + _auth_manager_cache[auth_env] = auth_manager + return auth_manager + except Exception as e: + logger.error(f"Failed to create ProfileManager for auth env '{auth_env}': {e}") + return None + +def _run_ffprobe(media_path, output_path): + """Runs ffprobe on the media file and saves the JSON output.""" + try: + ffprobe_cmd = [ + 'ffprobe', + '-v', 'quiet', + '-print_format', 'json', + '-show_format', + '-show_streams', + str(media_path) + ] + result = subprocess.run(ffprobe_cmd, check=True, capture_output=True, text=True, encoding='utf-8') + + with open(output_path, 'w', encoding='utf-8') as f: + f.write(result.stdout) + logger.info(f"Successfully generated ffprobe JSON for '{media_path.name}' at '{output_path}'") + return True + except FileNotFoundError: + logger.error("ffprobe command not found. Please ensure ffprobe is installed and in your PATH.") + return False + except subprocess.CalledProcessError as e: + logger.error(f"ffprobe failed for '{media_path.name}': {e.stderr}") + return False + except Exception as e: + logger.error(f"An error occurred while running ffprobe for '{media_path.name}': {e}") + return False + +def _cleanup_media_file(media_path): + """Replaces the media file with an empty file ending in .empty.""" + try: + empty_path = Path(str(media_path) + '.empty') + empty_path.touch() + os.remove(media_path) + logger.info(f"Cleaned up media file '{media_path.name}', replaced with '{empty_path.name}'") + except Exception as e: + logger.error(f"Failed to cleanup media file '{media_path.name}': {e}") + +def _post_process_and_move_info_json(file_path, profile_name, proxy_url, policy, worker_id, state_manager, profile_manager_instance=None): + """Helper to post-process a single info.json file and move it to the final directory.""" + direct_policy = policy.get('direct_docker_cli_policy', {}) + settings = policy.get('settings', {}) + save_dir = settings.get('save_info_json_dir') + if not save_dir: + return False + + video_id = "unknown" + try: + # Use a short delay and retry mechanism to handle cases where the file is not yet fully written. 
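+        # Up to 3 attempts, 0.2s apart; the last failure is re-raised and handled by the outer except below.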
+ for attempt in range(3): + try: + with open(file_path, 'r+', encoding='utf-8') as f: + info_data = json.load(f) + video_id = info_data.get('id', 'unknown') + env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') if profile_manager_instance else 'unknown' + info_data['_ytops_metadata'] = { + 'profile_name': profile_name, + 'proxy_url': proxy_url, + 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), + 'auth_env': env_name + } + f.seek(0) + json.dump(info_data, f, indent=2) + f.truncate() + break # Success + except (json.JSONDecodeError, IOError) as e: + if attempt < 2: + time.sleep(0.2) + else: + raise e + + final_path = Path(save_dir) / file_path.name + rename_template = direct_policy.get('rename_file_template') + if rename_template: + sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) + new_name = rename_template.format( + video_id=video_id, profile_name=profile_name, proxy=sanitized_proxy + ) + final_path = Path(save_dir) / new_name + + # Use shutil.move, which can handle cross-device moves (e.g., to an S3 mount) + # by falling back to a copy-and-delete operation. + shutil.move(str(file_path), str(final_path)) + logger.info(f"[Worker {worker_id}] Post-processed and moved info.json to '{final_path}'") + + # --- Create download task if in queue mode --- + queue_policy = policy.get('queue_policy', {}) + formats_to_download = queue_policy.get('formats_to_download') + + if formats_to_download and state_manager and state_manager.queue_provider: + try: + url = info_data.get('original_url') or info_data.get('webpage_url') + + download_task = { + 'info_json_path': str(final_path), + 'video_id': video_id, + 'url': url, + 'auth_profile_name': profile_name, + 'proxy_url': proxy_url, + 'auth_env': env_name, + } + + added_count = state_manager.add_download_tasks_batch([download_task]) + if added_count > 0: + logger.info(f"[Worker {worker_id}] [{profile_name}] Added {added_count} download task(s) to queue for {video_id}") + else: + logger.error(f"[Worker {worker_id}] [{profile_name}] Failed to add download task to queue for {video_id}") + return False + + except Exception as e: + logger.error(f"[Worker {worker_id}] [{profile_name}] Failed to create download task for {video_id}: {e}", exc_info=True) + return False + + return True + except (IOError, json.JSONDecodeError, OSError) as e: + logger.error(f"[Worker {worker_id}] Error post-processing '{file_path.name}' (video: {video_id}): {e}") + return False + + +def find_task_and_lock_profile(profile_manager, owner_id, profile_prefix, policy, worker_id): + """ + Scans for an available task and locks the specific ACTIVE profile that generated it. + This preserves a 1-to-1 relationship between a profile and its tasks. + Returns a tuple of (locked_profile_dict, claimed_task_path_obj) or (None, None). + """ + settings = policy.get('settings', {}) + info_json_dir = settings.get('info_json_dir') + if not info_json_dir: + return None, None + + logger.info(f"[Worker {worker_id}] Scanning for tasks in '{info_json_dir}'...") + + # 1. 
Get all available task files and group by profile + try: + task_files = list(Path(info_json_dir).rglob('*.json')) + except FileNotFoundError: + logger.warning(f"Info JSON directory not found during scan: {info_json_dir}") + return None, None + + if not task_files: + return None, None + + profile_regex_str = settings.get('profile_extraction_regex') + if not profile_regex_str: + logger.error(f"[Worker {worker_id}] The task-locking strategy requires 'settings.profile_extraction_regex' to be defined in the policy.") + return None, None + try: + profile_regex = re.compile(profile_regex_str) + except re.error as e: + logger.error(f"Invalid profile_extraction_regex in policy: '{profile_regex_str}'. Error: {e}") + return None, None + + tasks_by_profile = collections.defaultdict(list) + for task_path in task_files: + match = profile_regex.search(task_path.name) + if match and match.groups(): + profile_name = match.group(1) + tasks_by_profile[profile_name].append(task_path) + + if not tasks_by_profile: + logger.debug(f"[Worker {worker_id}] Found task files, but could not extract any profile names from them.") + return None, None + + # 2. Get ACTIVE profiles from Redis. + active_profiles = profile_manager.list_profiles(state_filter='ACTIVE') + active_profile_names = {p['name'] for p in active_profiles if p['name'].startswith(profile_prefix or '')} + + # 3. Find profiles that are both ACTIVE and have tasks. + candidate_profiles = list(active_profile_names.intersection(tasks_by_profile.keys())) + + if not candidate_profiles: + logger.debug(f"[Worker {worker_id}] Found tasks for profiles: {list(tasks_by_profile.keys())}, but none are currently ACTIVE.") + return None, None + + # 4. Shuffle candidates and try to lock one. + random.shuffle(candidate_profiles) + + for profile_name in candidate_profiles: + # Try to lock the profile. + # The owner_id already contains the worker_id, so the log message from profile_manager will be informative enough. + locked_profile = profile_manager.lock_profile(owner=owner_id, specific_profile_name=profile_name) + if locked_profile: + # Success! Claim one of its task files. + # Shuffle the tasks for this profile to avoid workers grabbing the same one. + profile_tasks = tasks_by_profile[profile_name] + random.shuffle(profile_tasks) + + for task_path in profile_tasks: + locked_path = task_path.with_suffix(f"{task_path.suffix}.LOCKED.{worker_id}") + try: + task_path.rename(locked_path) + logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' and claimed its task '{task_path.name}'.") + return locked_profile, locked_path + except FileNotFoundError: + # This specific task was claimed by another worker between our scan and now. + # This is rare but possible if two workers lock the same profile and try to claim tasks. + # Let's just try the next task for this profile. + logger.warning(f"[Worker {worker_id}] Task '{task_path.name}' was claimed by another worker. Trying another task for '{profile_name}'.") + continue + except OSError as e: + logger.error(f"[Worker {worker_id}] Error claiming task file '{task_path.name}': {e}") + # Something is wrong with this file, try the next one. + continue + + # If we are here, we locked a profile but failed to claim any of its tasks. + # This is a weird state. We should unlock and move on. + logger.warning(f"[Worker {worker_id}] Locked profile '{profile_name}' but failed to claim any of its tasks. Unlocking.") + profile_manager.unlock_profile(profile_name, owner=owner_id) + + # No suitable task/profile combo found. 
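+    # Callers treat (None, None) as "nothing to do right now"; e.g. run_throughput_worker pauses for worker_polling_interval_seconds before rescanning.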
+ logger.debug(f"[Worker {worker_id}] Could not lock any of the {len(candidate_profiles)} candidate profiles (all may have been locked by other workers).") + return None, None + +def _run_download_logic(source, info_json_content, policy, state_manager, args, running_processes, process_lock, profile_name=None, profile_manager_instance=None): + """Shared download logic for a single info.json.""" + proxy_url = None + if info_json_content: + try: + info_data = json.loads(info_json_content) + proxy_url = info_data.get('_proxy_url') + except (json.JSONDecodeError, AttributeError): + logger.warning(f"[{sp_utils.get_display_name(source)}] Could not parse info.json to get proxy for download controls.") + + d_policy = policy.get('download_policy', {}) + temp_download_dir = None + local_policy = policy + + if d_policy.get('output_to_airflow_ready_dir'): + local_policy = deepcopy(policy) + temp_download_dir = tempfile.mkdtemp(prefix='stress-dl-video-') + # This modification is safe because it's on a deep copy + local_policy.setdefault('download_policy', {})['output_dir'] = temp_download_dir + logger.info(f"[{sp_utils.get_display_name(source)}] Using temporary download directory: {temp_download_dir}") + + try: + if not state_manager.check_and_update_download_rate_limit(proxy_url, local_policy): + return [] + + state_manager.wait_for_proxy_cooldown(proxy_url, local_policy) + results = process_info_json_cycle(source, info_json_content, local_policy, state_manager, args, running_processes, process_lock, proxy_url=proxy_url, profile_name=profile_name, +profile_manager_instance=profile_manager_instance) + state_manager.update_proxy_finish_time(proxy_url) + + # --- Post-download logic for ffprobe, Airflow dir, and cleanup --- + for result in results: + if result.get('success') and result.get('downloaded_filepath'): + downloaded_filepath_str = result.get('downloaded_filepath') + if not downloaded_filepath_str or not os.path.exists(downloaded_filepath_str): + continue + + media_path = Path(downloaded_filepath_str) + + # 1. Run ffprobe on the file at its current location + if d_policy.get('run_ffprobe'): + ffprobe_output_path = None + # The source is the path to the task/info.json file. + if isinstance(source, Path): + # Name it like the info.json but with .ffprobe.json + ffprobe_output_path = source.with_name(f"{source.stem}.ffprobe.json") + else: + # Fallback if source is not a path (e.g. from memory) + video_id = result.get('video_id', 'unknown_video') + # Save it next to the media file + ffprobe_output_path = media_path.with_name(f"{video_id}.ffprobe.json") + + _run_ffprobe(media_path, ffprobe_output_path) + + # 2. Move to Airflow directory if configured. This updates media_path. 
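+                # Files land under <airflow_ready_dir_base_path>/<YYYYMMDDTHHMM>/<video_id>/, with the minute rounded down to a 10-minute bucket.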
+ if d_policy.get('output_to_airflow_ready_dir'): + try: + video_id = result.get('video_id') + if not video_id: + try: + info_data = json.loads(info_json_content) + video_id = info_data.get('id') + except (json.JSONDecodeError, AttributeError): + video_id = None + + if not video_id: + logger.error(f"[{sp_utils.get_display_name(source)}] Could not find video ID in result for moving file.") + else: + now = datetime.now() + rounded_minute = (now.minute // 10) * 10 + timestamp_str = now.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}" + + base_path = d_policy.get('airflow_ready_dir_base_path', 'downloadfiles/videos/ready') + if not os.path.isabs(base_path): + base_path = os.path.join(sp_utils._PROJECT_ROOT, base_path) + final_dir_base = os.path.join(base_path, timestamp_str) + final_dir_path = os.path.join(final_dir_base, video_id) + + os.makedirs(final_dir_path, exist_ok=True) + + if os.path.exists(media_path): + final_media_path = Path(final_dir_path) / media_path.name + shutil.move(str(media_path), str(final_media_path)) + logger.info(f"[{sp_utils.get_display_name(source)}] Moved media file to {final_media_path}") + media_path = final_media_path + + if isinstance(source, Path) and source.exists(): + new_info_json_name = f"info_{video_id}.json" + dest_info_json_path = os.path.join(final_dir_path, new_info_json_name) + shutil.copy(source, dest_info_json_path) + logger.info(f"[{sp_utils.get_display_name(source)}] Copied info.json to {dest_info_json_path}") + except Exception as e: + logger.error(f"[{sp_utils.get_display_name(source)}] Failed to move downloaded file to Airflow ready directory: {e}") + + # 3. Cleanup the file at its final location + if d_policy.get('cleanup'): + if os.path.exists(media_path): + _cleanup_media_file(media_path) + + return results + finally: + if temp_download_dir and os.path.exists(temp_download_dir): + shutil.rmtree(temp_download_dir) + logger.info(f"[{sp_utils.get_display_name(source)}] Cleaned up temporary directory: {temp_download_dir}") + + +def process_profile_task(profile_name, file_list, policy, state_manager, cycle_num, args, running_processes, process_lock, profile_manager_instance=None): + """Worker task for a profile, processing its files sequentially.""" + logger.info(f"Worker {get_worker_id()} starting task for profile '{profile_name}' with {len(file_list)} files.") + all_results = [] + for i, file_path in enumerate(file_list): + if state_manager.shutdown_event.is_set(): + logger.info(f"Shutdown requested, stopping task for profile '{profile_name}'.") + break + + try: + with open(file_path, 'r', encoding='utf-8') as f: + info_json_content = f.read() + except (IOError, FileNotFoundError) as e: + logger.error(f"[{sp_utils.get_display_name(file_path)}] Could not read info.json file: {e}") + continue # Skip this file + + results_for_file = _run_download_logic(file_path, info_json_content, policy, state_manager, args, running_processes, process_lock, profile_name=profile_name, profile_manager_instance=profile_manager_instance) + all_results.extend(results_for_file) + + # Mark file as processed if configured. This works for both 'once' and 'continuous' modes. 
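+        # 'continuous' scan mode reports the file to the state manager via mark_file_as_processed; 'mark_processed_files' additionally renames it to <name>.<timestamp>.processed.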
+ settings = policy.get('settings', {}) + if settings.get('directory_scan_mode') == 'continuous': + state_manager.mark_file_as_processed(file_path) + + if settings.get('mark_processed_files'): + try: + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + new_path = file_path.parent / f"{file_path.name}.{timestamp}.processed" + file_path.rename(new_path) + logger.info(f"Marked '{file_path.name}' as processed by renaming to '{new_path.name}'") + except (IOError, OSError) as e: + logger.error(f"Failed to rename processed file '{file_path.name}': {e}") + + # Check for stop conditions after processing each file + should_stop_profile = False + for result in results_for_file: + if not result['success']: + s_conditions = policy.get('stop_conditions', {}) + if s_conditions.get('on_failure') or \ + (s_conditions.get('on_http_403') and result['error_type'] == 'HTTP 403') or \ + (s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'): + logger.info(f"Stopping further processing for profile '{profile_name}' due to failure.") + should_stop_profile = True + break + if should_stop_profile: + break + + # Apply sleep between tasks for this profile + if i < len(file_list) - 1: + exec_control = policy.get('execution_control', {}) + sleep_cfg = exec_control.get('sleep_between_tasks', {}) + sleep_min = sleep_cfg.get('min_seconds', 0) + sleep_max_val = sleep_cfg.get('max_seconds') + + if sleep_min > 0 or sleep_max_val is not None: + sleep_max = sleep_min if sleep_max_val is None else sleep_max_val + + sleep_duration = 0 + if sleep_max < sleep_min: + logger.warning(f"sleep_between_tasks: max_seconds ({sleep_max}s) is less than min_seconds ({sleep_min}s). Using max_seconds as fixed sleep duration.") + sleep_duration = sleep_max + elif sleep_max > sleep_min: + sleep_duration = random.uniform(sleep_min, sleep_max) + else: # equal + sleep_duration = sleep_min + + if sleep_duration > 0: + logger.debug(f"Profile '{profile_name}' sleeping for {sleep_duration:.2f}s before next file.") + # Interruptible sleep + sleep_end_time = time.time() + sleep_duration + while time.time() < sleep_end_time: + if state_manager.shutdown_event.is_set(): + break + time.sleep(0.2) + + return all_results + + +def run_download_worker(info_json_path, info_json_content, format_to_download, policy, args, running_processes, process_lock, state_manager, profile_name=None): + """ + Performs a single download attempt. Designed to be run in a worker thread. + """ + worker_id = get_worker_id() + display_name = sp_utils.get_display_name(info_json_path) + profile_log_part = f" [Profile: {profile_name}]" if profile_name else "" + log_prefix = f"[Worker {worker_id}]{profile_log_part} [{display_name} @ {format_to_download}]" + + download_policy = policy.get('download_policy', {}) + settings = policy.get('settings', {}) + downloader = download_policy.get('downloader') + + # Get script command from settings, with fallback to download_policy for old format. + script_cmd_str = settings.get('download_script') + if not script_cmd_str: + script_cmd_str = download_policy.get('script') + + if script_cmd_str: + download_cmd = shlex.split(script_cmd_str) + elif downloader == 'aria2c_rpc': + download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'aria-rpc'] + elif downloader == 'native-cli': + download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'cli'] + else: + # Default to the new native-py downloader if downloader is 'native-py' or not specified. 
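+        # The resulting base command is roughly "<python> -m ytops_client.cli download py"; "-f <format>" and "--load-info-json <path>" are appended below.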
+ download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'py'] + + download_cmd.extend(['-f', format_to_download]) + + if downloader == 'aria2c_rpc': + if download_policy.get('aria_host'): + download_cmd.extend(['--aria-host', str(download_policy['aria_host'])]) + if download_policy.get('aria_port'): + download_cmd.extend(['--aria-port', str(download_policy['aria_port'])]) + if download_policy.get('aria_secret'): + download_cmd.extend(['--aria-secret', str(download_policy['aria_secret'])]) + if download_policy.get('output_dir'): + download_cmd.extend(['--output-dir', str(download_policy['output_dir'])]) + if download_policy.get('aria_remote_dir'): + download_cmd.extend(['--remote-dir', str(download_policy['aria_remote_dir'])]) + if download_policy.get('aria_fragments_dir'): + download_cmd.extend(['--fragments-dir', str(download_policy['aria_fragments_dir'])]) + # For stress testing, waiting is the desired default to get a success/fail result. + # Allow disabling it by explicitly setting aria_wait: false in the policy. + if download_policy.get('aria_wait', True): + download_cmd.append('--wait') + + if download_policy.get('auto_merge_fragments'): + download_cmd.append('--auto-merge-fragments') + if download_policy.get('remove_fragments_after_merge'): + download_cmd.append('--remove-fragments-after-merge') + if download_policy.get('purge_on_complete'): + download_cmd.append('--purge-on-complete') + + downloader_args = download_policy.get('downloader_args') + proxy = download_policy.get('proxy') + if proxy: + # Note: proxy_rename is not supported for aria2c_rpc mode. + proxy_arg = f"--all-proxy {shlex.quote(str(proxy))}" + if downloader_args: + downloader_args = f"{downloader_args} {proxy_arg}" + else: + downloader_args = proxy_arg + + if downloader_args: + # For aria2c_rpc, the downloader_args value is passed directly to the script's --downloader-args option. + download_cmd.extend(['--downloader-args', downloader_args]) + elif downloader == 'native-cli': + # This is the logic for the legacy download_tool.py (yt-dlp CLI wrapper). + pause_seconds = download_policy.get('pause_before_download_seconds') + if pause_seconds and isinstance(pause_seconds, (int, float)) and pause_seconds > 0: + download_cmd.extend(['--pause', str(pause_seconds)]) + + if download_policy.get('continue_downloads'): + download_cmd.append('--download-continue') + + # Add proxy if specified directly in the policy + proxy = download_policy.get('proxy') + if proxy: + download_cmd.extend(['--proxy', str(proxy)]) + + proxy_rename = download_policy.get('proxy_rename') + if proxy_rename: + download_cmd.extend(['--proxy-rename', str(proxy_rename)]) + + extra_args = download_policy.get('extra_args') + if extra_args: + download_cmd.extend(shlex.split(extra_args)) + + # Note: 'downloader' here refers to yt-dlp's internal downloader, not our script. + # The policy key 'external_downloader' is more clear, but we support 'downloader' for backward compatibility. 
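+        # e.g. external_downloader: "aria2c" in the policy is passed through as "--downloader aria2c".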
+        ext_downloader = download_policy.get('external_downloader') or download_policy.get('downloader')
+        if ext_downloader and ext_downloader not in ['native-cli', 'native-py', 'aria2c_rpc']:
+            download_cmd.extend(['--downloader', str(ext_downloader)])
+
+        downloader_args = download_policy.get('downloader_args')
+        if downloader_args:
+            download_cmd.extend(['--downloader-args', str(downloader_args)])
+
+        if download_policy.get('merge_output_format'):
+            download_cmd.extend(['--merge-output-format', str(download_policy['merge_output_format'])])
+
+    else:
+        # This is the default logic for the new native-py downloader.
+        if download_policy.get('output_to_buffer'):
+            download_cmd.append('--output-buffer')
+        else:
+            # --output-dir is only relevant if not outputting to buffer.
+            if download_policy.get('output_dir'):
+                download_cmd.extend(['--output-dir', str(download_policy['output_dir'])])
+
+        if download_policy.get('config'):
+            download_cmd.extend(['--config', str(download_policy['config'])])
+
+        if download_policy.get('temp_path'):
+            download_cmd.extend(['--temp-path', str(download_policy['temp_path'])])
+        if download_policy.get('continue_downloads'):
+            download_cmd.append('--download-continue')
+
+        pause_seconds = download_policy.get('pause_before_download_seconds')
+        if pause_seconds and isinstance(pause_seconds, (int, float)) and pause_seconds > 0:
+            download_cmd.extend(['--pause', str(pause_seconds)])
+
+        proxy = download_policy.get('proxy')
+        if proxy:
+            download_cmd.extend(['--proxy', str(proxy)])
+
+        proxy_rename = download_policy.get('proxy_rename')
+        if proxy_rename:
+            download_cmd.extend(['--proxy-rename', str(proxy_rename)])
+
+        # The 'extra_args' from the policy are for the download script itself, not for yt-dlp.
+        # We need to split them and add them to the command.
+        extra_args = download_policy.get('extra_args')
+        if extra_args:
+            download_cmd.extend(shlex.split(extra_args))
+
+        # Pass through downloader settings for yt-dlp to use
+        # e.g.
to tell yt-dlp to use aria2c as its backend + ext_downloader = download_policy.get('external_downloader') + if ext_downloader: + download_cmd.extend(['--downloader', str(ext_downloader)]) + + downloader_args = download_policy.get('downloader_args') + if downloader_args: + download_cmd.extend(['--downloader-args', str(downloader_args)]) + + if args.dummy: + # Create a copy to add the info.json path for logging, without modifying the original + log_cmd = list(download_cmd) + if isinstance(info_json_path, Path) and info_json_path.exists(): + log_cmd.extend(['--load-info-json', str(info_json_path)]) + else: + log_cmd.extend(['--load-info-json', '']) + + logger.info(f"{log_prefix} Dummy mode: simulating download...") + logger.info(f"{log_prefix} Dummy mode: Would run command: {' '.join(shlex.quote(s) for s in log_cmd)}") + + dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {}) + min_seconds = dummy_settings.get('download_min_seconds', 1.0) + max_seconds = dummy_settings.get('download_max_seconds', 3.0) + failure_rate = dummy_settings.get('download_failure_rate', 0.0) + skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0) + + sleep_duration = random.uniform(min_seconds, max_seconds) + logger.info(f"{log_prefix} Dummy mode: simulating download for {sleep_duration:.2f}s (from policy range {min_seconds}-{max_seconds}s).") + time.sleep(sleep_duration) # Simulate work + + rand_val = random.random() + should_fail_skipped = rand_val < skipped_rate + should_fail_fatal = not should_fail_skipped and rand_val < (skipped_rate + failure_rate) + + if should_fail_skipped: + logger.warning(f"{log_prefix} Dummy mode: Injecting simulated skipped download failure.") + return { + 'type': 'download', + 'path': str(info_json_path), + 'format': format_to_download, + 'success': False, + 'error_type': 'DummySkippedFailure', + 'details': 'FAIL (Dummy mode, skipped)', + 'downloaded_bytes': 0, + 'profile': profile_name, + 'downloaded_filepath': None, + 'is_tolerated_error': True + } + + if should_fail_fatal: + logger.warning(f"{log_prefix} Dummy mode: Injecting simulated fatal download failure.") + return { + 'type': 'download', + 'path': str(info_json_path), + 'format': format_to_download, + 'success': False, + 'error_type': 'DummyFailure', + 'details': 'FAIL (Dummy mode, fatal)', + 'downloaded_bytes': 0, + 'profile': profile_name, + 'downloaded_filepath': None + } + + downloaded_filepath = f'/dev/null/{display_name}.mp4' + if download_policy.get('output_to_airflow_ready_dir'): + output_dir = download_policy.get('output_dir') + if output_dir and os.path.isdir(output_dir): + try: + dummy_path_obj = Path(output_dir) / f"{display_name}.mp4" + dummy_path_obj.touch() + downloaded_filepath = str(dummy_path_obj) + logger.info(f"{log_prefix} Dummy mode: created dummy file for Airflow move: {downloaded_filepath}") + except OSError as e: + logger.error(f"{log_prefix} Dummy mode: failed to create dummy file in '{output_dir}': {e}") + + return { + 'type': 'download', + 'path': str(info_json_path), + 'format': format_to_download, + 'success': True, + 'error_type': None, + 'details': 'OK (Dummy mode)', + 'downloaded_bytes': random.randint(100000, 5000000), + 'profile': profile_name, + 'downloaded_filepath': downloaded_filepath + } + + logger.info(f"{log_prefix} Kicking off download process...") + + temp_info_file_path = None + try: + if isinstance(info_json_path, Path) and info_json_path.exists(): + # The info.json is already in a file, pass its path directly. 
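+            # (The temporary-file fallback below is cleaned up in the finally clause once the command returns.)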
+ download_cmd.extend(['--load-info-json', str(info_json_path)]) + else: + # The info.json content is in memory, so write it to a temporary file. + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', encoding='utf-8') as temp_f: + temp_f.write(info_json_content) + temp_info_file_path = temp_f.name + download_cmd.extend(['--load-info-json', temp_info_file_path]) + + cmd_str_for_log = ' '.join(shlex.quote(s) for s in download_cmd) + logger.info(f"{log_prefix} Running download command: {cmd_str_for_log}") + output_to_buffer = download_policy.get('output_to_buffer', False) + retcode, stdout, stderr = run_command( + download_cmd, + running_processes, + process_lock, + binary_stdout=output_to_buffer, + stream_output=getattr(args, 'print_downloader_log', False), + stream_prefix=f"{log_prefix} | " + ) + finally: + if temp_info_file_path and os.path.exists(temp_info_file_path): + os.unlink(temp_info_file_path) + + is_403_error = "HTTP Error 403" in stderr + is_timeout_error = "Read timed out" in stderr + output_to_buffer = download_policy.get('output_to_buffer', False) + + # Parse stdout to find the downloaded file path. + # The download scripts print the final path, sometimes with a prefix. + downloaded_filepath = None + if stdout and not output_to_buffer: + lines = stdout.strip().split('\n') + if lines: + last_line = lines[-1].strip() + + # Handle aria-rpc output format: "Download... successful: " + aria_match = re.search(r'successful: (.+)', last_line) + if aria_match: + path_from_aria = aria_match.group(1).strip() + if os.path.exists(path_from_aria): + downloaded_filepath = path_from_aria + else: + logger.warning(f"[{display_name}] Path from aria-rpc output does not exist: '{path_from_aria}'") + # Handle native-py/cli output format (just the path) + elif os.path.exists(last_line): + downloaded_filepath = last_line + + result = { + 'type': 'download', + 'path': str(info_json_path), + 'format': format_to_download, + 'success': retcode == 0, + 'error_type': None, + 'details': '', + 'downloaded_bytes': 0, + 'profile': profile_name, + 'downloaded_filepath': downloaded_filepath + } + + if retcode == 0: + details_str = "OK" + size_in_bytes = 0 + if output_to_buffer: + # The most accurate size is the length of the stdout buffer. + size_in_bytes = len(stdout) # stdout is bytes + details_str += f" (Buffered {sp_utils.format_size(size_in_bytes)})" + else: + size_match = re.search(r'\[download\]\s+100%\s+of\s+~?([0-9.]+)(B|KiB|MiB|GiB)', stderr) + if size_match: + value = float(size_match.group(1)) + unit = size_match.group(2) + multipliers = {"B": 1, "KiB": 1024, "MiB": 1024**2, "GiB": 1024**3} + size_in_bytes = int(value * multipliers.get(unit, 1)) + details_str += f" ({size_match.group(1)}{unit})" + + result['downloaded_bytes'] = size_in_bytes + result['details'] = details_str + else: + # Check for shutdown first. This is the most likely cause for an abrupt non-zero exit. + if state_manager.shutdown_event.is_set(): + result['error_type'] = 'Cancelled' + result['details'] = 'Task cancelled during shutdown.' + else: + # Check both stdout and stderr for error messages, as logging might be directed to stdout. + full_output = f"{stdout}\n{stderr}" + # Look for common error indicators from both yt-dlp and our scripts. + error_lines = [ + line for line in full_output.strip().split('\n') + if 'ERROR:' in line or ' - ERROR - ' in line or 'DownloadError:' in line or 'Traceback' in line + ] + if error_lines: + # Try to get the most specific part of the error. 
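+                    # e.g. a line like "2025-01-01 00:00:00 - downloader - ERROR - HTTP Error 403: Forbidden" is reduced to "HTTP Error 403: Forbidden".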
+ details = error_lines[-1].strip() + # Our log format is "timestamp - logger - LEVEL - message" + if ' - ERROR - ' in details: + details = details.split(' - ERROR - ', 1)[-1] + result['details'] = details + else: + # Fallback to last non-empty line of stderr if no explicit "ERROR" line found + stderr_lines = [line for line in stderr.strip().split('\n') if line.strip()] + result['details'] = stderr_lines[-1].strip() if stderr_lines else "Unknown error" + + if is_403_error: + result['error_type'] = 'HTTP 403' + elif is_timeout_error: + result['error_type'] = 'Timeout' + else: + result['error_type'] = f'Exit Code {retcode}' + + return result + + +def process_info_json_cycle(path, content, policy, state_manager, args, running_processes, process_lock, proxy_url=None, profile_name=None, profile_manager_instance=None): + """ + Processes one info.json file for one cycle, downloading selected formats. + """ + results = [] + display_name = sp_utils.get_display_name(path) + d_policy = policy.get('download_policy', {}) + s_conditions = policy.get('stop_conditions', {}) + + try: + info_data = json.loads(content) + + # If the task file specifies a format, use it instead of the policy's format list. + # This is for granular tasks generated by the task-generator tool. + if '_ytops_download_format' in info_data: + format_selection = info_data['_ytops_download_format'] + logger.info(f"[{display_name}] Using format '{format_selection}' from task file.") + else: + format_selection = d_policy.get('formats', '') + available_formats = [f['format_id'] for f in info_data.get('formats', [])] + if not available_formats: + logger.warning(f"[{display_name}] No formats found in info.json. Skipping.") + return [] + + formats_to_test = [] + if format_selection == 'all': + formats_to_test = available_formats + elif format_selection.startswith('random:'): + percent = float(format_selection.split(':')[1].rstrip('%')) + count = max(1, int(len(available_formats) * (percent / 100.0))) + formats_to_test = random.sample(available_formats, k=count) + elif format_selection.startswith('random_from:'): + choices = [f.strip() for f in format_selection.split(':', 1)[1].split(',')] + valid_choices = [f for f in choices if f in available_formats] + if valid_choices: + formats_to_test = [random.choice(valid_choices)] + else: + # If the format selection contains complex selector characters (other than comma), + # treat the entire string as a single format selector for yt-dlp to interpret. + # Otherwise, split by comma to test each specified format ID individually. + if any(c in format_selection for c in '/+[]()'): + requested_formats = [format_selection] + else: + requested_formats = [f.strip() for f in format_selection.split(',') if f.strip()] + + formats_to_test = [] + selector_keywords = ('best', 'worst', 'bestvideo', 'bestaudio') + + for req_fmt in requested_formats: + # Treat as a selector and pass through if it contains special characters + # or starts with a known selector keyword. + if any(c in req_fmt for c in '/+[]()') or req_fmt.startswith(selector_keywords): + formats_to_test.append(req_fmt) + continue + + # Otherwise, treat as a specific format ID that must exist. + # Check for exact match first. + if req_fmt in available_formats: + formats_to_test.append(req_fmt) + continue + + # If no exact match, check for formats that start with this ID + '-' and then digits + # e.g., req_fmt '140' should match '140-0' but not '140-something'. 
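+                # (re.escape leaves plain numeric IDs unchanged, so for "140" the compiled pattern is ^140-\d+$.)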
+ prefix_match_re = re.compile(rf'^{re.escape(req_fmt)}-\d+$') + first_match = next((af for af in available_formats if prefix_match_re.match(af)), None) + + if first_match: + logger.info(f"[{display_name}] Requested format '{req_fmt}' not found. Using first available match: '{first_match}'.") + formats_to_test.append(first_match) + else: + logger.warning(f"[{display_name}] Requested format '{req_fmt}' not found in available formats and is not a recognized selector. Skipping this format.") + + except json.JSONDecodeError: + logger.error(f"[{display_name}] Failed to parse info.json. Skipping.") + return [] + + for i, format_id in enumerate(formats_to_test): + if state_manager.shutdown_event.is_set(): + logger.info(f"Shutdown requested, stopping further format tests for {display_name}.") + break + + # Check if the format URL is expired before attempting to download + format_details = next((f for f in info_data.get('formats', []) if f.get('format_id') == format_id), None) + + # If format_id is a complex selector, it won't be found directly. As a heuristic, + # check the expiration of the first format URL, as they typically share the same expiration. + if not format_details and any(c in format_id for c in '/+[]()'): + available_formats_list = info_data.get('formats', []) + if available_formats_list: + format_details = available_formats_list[0] + + # The check is enabled by default and can be disabled via policy. + if d_policy.get('check_url_expiration', True) and format_details and 'url' in format_details: + url_to_check = format_details['url'] + time_shift_minutes = d_policy.get('expire_time_shift_minutes', 0) + status, time_left_seconds = sp_utils.check_url_expiry(url_to_check, time_shift_minutes) + + logger.debug(f"[{display_name}] URL expiration check for format '{format_id}': status={status}, time_left={time_left_seconds:.0f}s") + + if status == 'expired': + details = "Download URL is expired" + if time_shift_minutes > 0 and time_left_seconds > 0: + logger.warning(f"[{display_name}] Skipping format '{format_id}' because its URL will expire in {time_left_seconds/60:.1f}m (within {time_shift_minutes}m time-shift).") + details = f"URL will expire within {time_shift_minutes}m time-shift" + else: + logger.warning(f"[{display_name}] Skipping format '{format_id}' because its URL is expired.") + + result = { + 'type': 'download', 'path': str(path), 'format': format_id, + 'success': False, 'error_type': 'Skipped (Expired URL)', + 'details': details, 'downloaded_bytes': 0, 'is_tolerated_error': True + } + if proxy_url: + result['proxy_url'] = proxy_url + + if profile_manager_instance and profile_name: + profile_manager_instance.record_activity(profile_name, 'tolerated_error') + + state_manager.log_event(result) + results.append(result) + continue # Move to the next format + + elif status == 'no_expiry_info': + logger.debug(f"[{display_name}] No valid 'expire' parameter found in format URL for '{format_id}'. Skipping expiration check.") + + result = run_download_worker(path, content, format_id, policy, args, running_processes, process_lock, state_manager, profile_name=profile_name) + if 'id' in info_data: + result['video_id'] = info_data['id'] + if proxy_url: + result['proxy_url'] = proxy_url + + # --- Cross-simulation state update --- + # After a download attempt, we must decrement the 'pending_downloads' counter + # on the original AUTH profile that generated this task. This allows the auth + # profile to be released from the WAITING_DOWNLOADS state. 
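+        # The '_ytops_metadata' block (profile_name, proxy_url, auth_env, ...) is added by _post_process_and_move_info_json when the info.json is produced.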
+ metadata = info_data.get('_ytops_metadata', {}) + auth_profile_name = metadata.get('profile_name') + auth_env = metadata.get('auth_env') + + if auth_profile_name and auth_env and profile_manager_instance: + auth_manager = get_auth_manager(profile_manager_instance, auth_env) + if auth_manager: + activity_type = None + if result.get('success'): + activity_type = 'download' + elif result.get('is_tolerated_error'): + activity_type = 'tolerated_error' + elif result.get('error_type') != 'Cancelled': + activity_type = 'download_error' + + if activity_type: + logger.debug(f"[{display_name}] Decrementing pending download counter for auth profile '{auth_profile_name}' in env '{auth_env}'.") + auth_manager.decrement_pending_downloads(auth_profile_name) + else: + logger.warning(f"[{display_name}] Could not get auth manager for env '{auth_env}' to decrement pending counter.") + + # --- Record activity on the DOWNLOAD profile that performed the work --- + if profile_manager_instance and profile_name: + if result.get('success'): + profile_manager_instance.record_activity(profile_name, 'download') + elif result.get('error_type') == 'Cancelled': + pass # Do not record cancellations + elif result.get('is_tolerated_error'): + profile_manager_instance.record_activity(profile_name, 'tolerated_error') + else: + profile_manager_instance.record_activity(profile_name, 'download_error') + + state_manager.log_event(result) + results.append(result) + + worker_id = get_worker_id() + status = "SUCCESS" if result['success'] else f"FAILURE ({result['error_type']})" + profile_log_part = f" [Profile: {profile_name}]" if profile_name else "" + logger.info(f"[Worker {worker_id}]{profile_log_part} Result for {display_name} (format {format_id}): {status} - {result.get('details', 'OK')}") + + if not result['success']: + if s_conditions.get('on_failure') or \ + (s_conditions.get('on_http_403') and result['error_type'] == 'HTTP 403') or \ + (s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'): + logger.info(f"Stopping further format tests for {display_name} in this cycle due to failure.") + break + + sleep_cfg = d_policy.get('sleep_between_formats', {}) + sleep_min = sleep_cfg.get('min_seconds', 0) + if sleep_min > 0 and i < len(formats_to_test) - 1: + sleep_max = sleep_cfg.get('max_seconds') or sleep_min + if sleep_max > sleep_min: + sleep_duration = random.uniform(sleep_min, sleep_max) + else: + sleep_duration = sleep_min + + logger.debug(f"Sleeping for {sleep_duration:.2f}s between formats for {display_name}.") + # Interruptible sleep + sleep_end_time = time.time() + sleep_duration + while time.time() < sleep_end_time: + if state_manager.shutdown_event.is_set(): + break + time.sleep(0.2) + + return results diff --git a/ytops_client-source/ytops_client/stress_policy/workers.py b/ytops_client-source/ytops_client/stress_policy/workers.py index 5d5a079..c1bedb9 100644 --- a/ytops_client-source/ytops_client/stress_policy/workers.py +++ b/ytops_client-source/ytops_client/stress_policy/workers.py @@ -1,3043 +1,11 @@ -import collections -import json -import logging -import os -import random -import re -import shlex -import sys -import tempfile -import shutil -import subprocess -import threading -import time -from copy import deepcopy -from datetime import datetime, timezone -from pathlib import Path - -from . 
import utils as sp_utils -from .process_runners import run_command, run_docker_container, get_worker_id -from ..profile_manager_tool import ProfileManager - - -logger = logging.getLogger(__name__) - -# --- Auth Profile Manager Cache --- -# This is a cache to hold a ProfileManager instance for the auth simulation. -# It's needed so that download workers can decrement the correct pending download counter. -_auth_manager_cache = {} -_auth_manager_lock = threading.Lock() - -def get_auth_manager(current_manager, auth_env: str): - """ - Gets a ProfileManager instance for a specific auth simulation environment. - It uses the auth_env provided from the info.json metadata. - """ - with _auth_manager_lock: - if not auth_env: - return None - - if auth_env in _auth_manager_cache: - return _auth_manager_cache[auth_env] - - logger.info(f"Creating new ProfileManager for auth simulation env: '{auth_env}'") - try: - # Re-use connection settings from the current manager - redis_conn_kwargs = current_manager.redis.connection_pool.connection_kwargs - auth_key_prefix = f"{auth_env}_profile_mgmt_" - - auth_manager = ProfileManager( - redis_host=redis_conn_kwargs.get('host'), - redis_port=redis_conn_kwargs.get('port'), - redis_password=redis_conn_kwargs.get('password'), - key_prefix=auth_key_prefix - ) - _auth_manager_cache[auth_env] = auth_manager - return auth_manager - except Exception as e: - logger.error(f"Failed to create ProfileManager for auth env '{auth_env}': {e}") - return None - -def _run_ffprobe(media_path, output_path): - """Runs ffprobe on the media file and saves the JSON output.""" - try: - ffprobe_cmd = [ - 'ffprobe', - '-v', 'quiet', - '-print_format', 'json', - '-show_format', - '-show_streams', - str(media_path) - ] - result = subprocess.run(ffprobe_cmd, check=True, capture_output=True, text=True, encoding='utf-8') - - with open(output_path, 'w', encoding='utf-8') as f: - f.write(result.stdout) - logger.info(f"Successfully generated ffprobe JSON for '{media_path.name}' at '{output_path}'") - return True - except FileNotFoundError: - logger.error("ffprobe command not found. Please ensure ffprobe is installed and in your PATH.") - return False - except subprocess.CalledProcessError as e: - logger.error(f"ffprobe failed for '{media_path.name}': {e.stderr}") - return False - except Exception as e: - logger.error(f"An error occurred while running ffprobe for '{media_path.name}': {e}") - return False - -def _cleanup_media_file(media_path): - """Replaces the media file with an empty file ending in .empty.""" - try: - empty_path = Path(str(media_path) + '.empty') - empty_path.touch() - os.remove(media_path) - logger.info(f"Cleaned up media file '{media_path.name}', replaced with '{empty_path.name}'") - except Exception as e: - logger.error(f"Failed to cleanup media file '{media_path.name}': {e}") - -def _post_process_and_move_info_json(file_path, profile_name, proxy_url, policy, worker_id, state_manager, profile_manager_instance=None): - """Helper to post-process a single info.json file and move it to the final directory.""" - direct_policy = policy.get('direct_docker_cli_policy', {}) - settings = policy.get('settings', {}) - save_dir = settings.get('save_info_json_dir') - if not save_dir: - return False - - video_id = "unknown" - try: - # Use a short delay and retry mechanism to handle cases where the file is not yet fully written. 
- for attempt in range(3): - try: - with open(file_path, 'r+', encoding='utf-8') as f: - info_data = json.load(f) - video_id = info_data.get('id', 'unknown') - env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') if profile_manager_instance else 'unknown' - info_data['_ytops_metadata'] = { - 'profile_name': profile_name, - 'proxy_url': proxy_url, - 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), - 'auth_env': env_name - } - f.seek(0) - json.dump(info_data, f, indent=2) - f.truncate() - break # Success - except (json.JSONDecodeError, IOError) as e: - if attempt < 2: - time.sleep(0.2) - else: - raise e - - final_path = Path(save_dir) / file_path.name - rename_template = direct_policy.get('rename_file_template') - if rename_template: - sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) - new_name = rename_template.format( - video_id=video_id, profile_name=profile_name, proxy=sanitized_proxy - ) - final_path = Path(save_dir) / new_name - - # Use shutil.move, which can handle cross-device moves (e.g., to an S3 mount) - # by falling back to a copy-and-delete operation. - shutil.move(str(file_path), str(final_path)) - logger.info(f"[Worker {worker_id}] Post-processed and moved info.json to '{final_path}'") - - # --- Create download task if in queue mode --- - queue_policy = policy.get('queue_policy', {}) - formats_to_download = queue_policy.get('formats_to_download') - - if formats_to_download and state_manager and state_manager.queue_provider: - try: - url = info_data.get('original_url') or info_data.get('webpage_url') - - download_task = { - 'info_json_path': str(final_path), - 'video_id': video_id, - 'url': url, - 'auth_profile_name': profile_name, - 'proxy_url': proxy_url, - 'auth_env': env_name, - } - - added_count = state_manager.add_download_tasks_batch([download_task]) - if added_count > 0: - logger.info(f"[Worker {worker_id}] [{profile_name}] Added {added_count} download task(s) to queue for {video_id}") - else: - logger.error(f"[Worker {worker_id}] [{profile_name}] Failed to add download task to queue for {video_id}") - return False - - except Exception as e: - logger.error(f"[Worker {worker_id}] [{profile_name}] Failed to create download task for {video_id}: {e}", exc_info=True) - return False - - return True - except (IOError, json.JSONDecodeError, OSError) as e: - logger.error(f"[Worker {worker_id}] Error post-processing '{file_path.name}' (video: {video_id}): {e}") - return False - - -def find_task_and_lock_profile(profile_manager, owner_id, profile_prefix, policy, worker_id): - """ - Scans for an available task and locks the specific ACTIVE profile that generated it. - This preserves a 1-to-1 relationship between a profile and its tasks. - Returns a tuple of (locked_profile_dict, claimed_task_path_obj) or (None, None). - """ - settings = policy.get('settings', {}) - info_json_dir = settings.get('info_json_dir') - if not info_json_dir: - return None, None - - logger.info(f"[Worker {worker_id}] Scanning for tasks in '{info_json_dir}'...") - - # 1. 
Get all available task files and group by profile - try: - task_files = list(Path(info_json_dir).rglob('*.json')) - except FileNotFoundError: - logger.warning(f"Info JSON directory not found during scan: {info_json_dir}") - return None, None - - if not task_files: - return None, None - - profile_regex_str = settings.get('profile_extraction_regex') - if not profile_regex_str: - logger.error(f"[Worker {worker_id}] The task-locking strategy requires 'settings.profile_extraction_regex' to be defined in the policy.") - return None, None - try: - profile_regex = re.compile(profile_regex_str) - except re.error as e: - logger.error(f"Invalid profile_extraction_regex in policy: '{profile_regex_str}'. Error: {e}") - return None, None - - tasks_by_profile = collections.defaultdict(list) - for task_path in task_files: - match = profile_regex.search(task_path.name) - if match and match.groups(): - profile_name = match.group(1) - tasks_by_profile[profile_name].append(task_path) - - if not tasks_by_profile: - logger.debug(f"[Worker {worker_id}] Found task files, but could not extract any profile names from them.") - return None, None - - # 2. Get ACTIVE profiles from Redis. - active_profiles = profile_manager.list_profiles(state_filter='ACTIVE') - active_profile_names = {p['name'] for p in active_profiles if p['name'].startswith(profile_prefix or '')} - - # 3. Find profiles that are both ACTIVE and have tasks. - candidate_profiles = list(active_profile_names.intersection(tasks_by_profile.keys())) - - if not candidate_profiles: - logger.debug(f"[Worker {worker_id}] Found tasks for profiles: {list(tasks_by_profile.keys())}, but none are currently ACTIVE.") - return None, None - - # 4. Shuffle candidates and try to lock one. - random.shuffle(candidate_profiles) - - for profile_name in candidate_profiles: - # Try to lock the profile. - # The owner_id already contains the worker_id, so the log message from profile_manager will be informative enough. - locked_profile = profile_manager.lock_profile(owner=owner_id, specific_profile_name=profile_name) - if locked_profile: - # Success! Claim one of its task files. - # Shuffle the tasks for this profile to avoid workers grabbing the same one. - profile_tasks = tasks_by_profile[profile_name] - random.shuffle(profile_tasks) - - for task_path in profile_tasks: - locked_path = task_path.with_suffix(f"{task_path.suffix}.LOCKED.{worker_id}") - try: - task_path.rename(locked_path) - logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' and claimed its task '{task_path.name}'.") - return locked_profile, locked_path - except FileNotFoundError: - # This specific task was claimed by another worker between our scan and now. - # This is rare but possible if two workers lock the same profile and try to claim tasks. - # Let's just try the next task for this profile. - logger.warning(f"[Worker {worker_id}] Task '{task_path.name}' was claimed by another worker. Trying another task for '{profile_name}'.") - continue - except OSError as e: - logger.error(f"[Worker {worker_id}] Error claiming task file '{task_path.name}': {e}") - # Something is wrong with this file, try the next one. - continue - - # If we are here, we locked a profile but failed to claim any of its tasks. - # This is a weird state. We should unlock and move on. - logger.warning(f"[Worker {worker_id}] Locked profile '{profile_name}' but failed to claim any of its tasks. Unlocking.") - profile_manager.unlock_profile(profile_name, owner=owner_id) - - # No suitable task/profile combo found. 
- logger.debug(f"[Worker {worker_id}] Could not lock any of the {len(candidate_profiles)} candidate profiles (all may have been locked by other workers).") - return None, None - -def _run_download_logic(source, info_json_content, policy, state_manager, args, running_processes, process_lock, profile_name=None, profile_manager_instance=None): - """Shared download logic for a single info.json.""" - proxy_url = None - if info_json_content: - try: - info_data = json.loads(info_json_content) - proxy_url = info_data.get('_proxy_url') - except (json.JSONDecodeError, AttributeError): - logger.warning(f"[{sp_utils.get_display_name(source)}] Could not parse info.json to get proxy for download controls.") - - d_policy = policy.get('download_policy', {}) - temp_download_dir = None - local_policy = policy - - if d_policy.get('output_to_airflow_ready_dir'): - local_policy = deepcopy(policy) - temp_download_dir = tempfile.mkdtemp(prefix='stress-dl-video-') - # This modification is safe because it's on a deep copy - local_policy.setdefault('download_policy', {})['output_dir'] = temp_download_dir - logger.info(f"[{sp_utils.get_display_name(source)}] Using temporary download directory: {temp_download_dir}") - - try: - if not state_manager.check_and_update_download_rate_limit(proxy_url, local_policy): - return [] - - state_manager.wait_for_proxy_cooldown(proxy_url, local_policy) - results = process_info_json_cycle(source, info_json_content, local_policy, state_manager, args, running_processes, process_lock, proxy_url=proxy_url, profile_name=profile_name, -profile_manager_instance=profile_manager_instance) - state_manager.update_proxy_finish_time(proxy_url) - - # --- Post-download logic for ffprobe, Airflow dir, and cleanup --- - for result in results: - if result.get('success') and result.get('downloaded_filepath'): - downloaded_filepath_str = result.get('downloaded_filepath') - if not downloaded_filepath_str or not os.path.exists(downloaded_filepath_str): - continue - - media_path = Path(downloaded_filepath_str) - - # 1. Run ffprobe on the file at its current location - if d_policy.get('run_ffprobe'): - ffprobe_output_path = None - # The source is the path to the task/info.json file. - if isinstance(source, Path): - # Name it like the info.json but with .ffprobe.json - ffprobe_output_path = source.with_name(f"{source.stem}.ffprobe.json") - else: - # Fallback if source is not a path (e.g. from memory) - video_id = result.get('video_id', 'unknown_video') - # Save it next to the media file - ffprobe_output_path = media_path.with_name(f"{video_id}.ffprobe.json") - - _run_ffprobe(media_path, ffprobe_output_path) - - # 2. Move to Airflow directory if configured. This updates media_path. 
- if d_policy.get('output_to_airflow_ready_dir'): - try: - video_id = result.get('video_id') - if not video_id: - try: - info_data = json.loads(info_json_content) - video_id = info_data.get('id') - except (json.JSONDecodeError, AttributeError): - video_id = None - - if not video_id: - logger.error(f"[{sp_utils.get_display_name(source)}] Could not find video ID in result for moving file.") - else: - now = datetime.now() - rounded_minute = (now.minute // 10) * 10 - timestamp_str = now.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}" - - base_path = d_policy.get('airflow_ready_dir_base_path', 'downloadfiles/videos/ready') - if not os.path.isabs(base_path): - base_path = os.path.join(sp_utils._PROJECT_ROOT, base_path) - final_dir_base = os.path.join(base_path, timestamp_str) - final_dir_path = os.path.join(final_dir_base, video_id) - - os.makedirs(final_dir_path, exist_ok=True) - - if os.path.exists(media_path): - final_media_path = Path(final_dir_path) / media_path.name - shutil.move(str(media_path), str(final_media_path)) - logger.info(f"[{sp_utils.get_display_name(source)}] Moved media file to {final_media_path}") - media_path = final_media_path - - if isinstance(source, Path) and source.exists(): - new_info_json_name = f"info_{video_id}.json" - dest_info_json_path = os.path.join(final_dir_path, new_info_json_name) - shutil.copy(source, dest_info_json_path) - logger.info(f"[{sp_utils.get_display_name(source)}] Copied info.json to {dest_info_json_path}") - except Exception as e: - logger.error(f"[{sp_utils.get_display_name(source)}] Failed to move downloaded file to Airflow ready directory: {e}") - - # 3. Cleanup the file at its final location - if d_policy.get('cleanup'): - if os.path.exists(media_path): - _cleanup_media_file(media_path) - - return results - finally: - if temp_download_dir and os.path.exists(temp_download_dir): - shutil.rmtree(temp_download_dir) - logger.info(f"[{sp_utils.get_display_name(source)}] Cleaned up temporary directory: {temp_download_dir}") - - -def process_profile_task(profile_name, file_list, policy, state_manager, cycle_num, args, running_processes, process_lock, profile_manager_instance=None): - """Worker task for a profile, processing its files sequentially.""" - logger.info(f"Worker {get_worker_id()} starting task for profile '{profile_name}' with {len(file_list)} files.") - all_results = [] - for i, file_path in enumerate(file_list): - if state_manager.shutdown_event.is_set(): - logger.info(f"Shutdown requested, stopping task for profile '{profile_name}'.") - break - - try: - with open(file_path, 'r', encoding='utf-8') as f: - info_json_content = f.read() - except (IOError, FileNotFoundError) as e: - logger.error(f"[{sp_utils.get_display_name(file_path)}] Could not read info.json file: {e}") - continue # Skip this file - - results_for_file = _run_download_logic(file_path, info_json_content, policy, state_manager, args, running_processes, process_lock, profile_name=profile_name, profile_manager_instance=profile_manager_instance) - all_results.extend(results_for_file) - - # Mark file as processed if configured. This works for both 'once' and 'continuous' modes. 
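# Illustrative sketch (not part of the original file): the Airflow-ready layout
# above buckets output into ten-minute windows, e.g. .../20240501T1230/<video_id>/.
# The helper name (ready_dir_for) is hypothetical.
import os
from datetime import datetime

def ready_dir_for(base_path, video_id, now=None):
    now = now or datetime.now()
    rounded_minute = (now.minute // 10) * 10          # 0, 10, 20, 30, 40, 50
    bucket = now.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}"
    return os.path.join(base_path, bucket, video_id)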
- settings = policy.get('settings', {}) - if settings.get('directory_scan_mode') == 'continuous': - state_manager.mark_file_as_processed(file_path) - - if settings.get('mark_processed_files'): - try: - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - new_path = file_path.parent / f"{file_path.name}.{timestamp}.processed" - file_path.rename(new_path) - logger.info(f"Marked '{file_path.name}' as processed by renaming to '{new_path.name}'") - except (IOError, OSError) as e: - logger.error(f"Failed to rename processed file '{file_path.name}': {e}") - - # Check for stop conditions after processing each file - should_stop_profile = False - for result in results_for_file: - if not result['success']: - s_conditions = policy.get('stop_conditions', {}) - if s_conditions.get('on_failure') or \ - (s_conditions.get('on_http_403') and result['error_type'] == 'HTTP 403') or \ - (s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'): - logger.info(f"Stopping further processing for profile '{profile_name}' due to failure.") - should_stop_profile = True - break - if should_stop_profile: - break - - # Apply sleep between tasks for this profile - if i < len(file_list) - 1: - exec_control = policy.get('execution_control', {}) - sleep_cfg = exec_control.get('sleep_between_tasks', {}) - sleep_min = sleep_cfg.get('min_seconds', 0) - sleep_max_val = sleep_cfg.get('max_seconds') - - if sleep_min > 0 or sleep_max_val is not None: - sleep_max = sleep_min if sleep_max_val is None else sleep_max_val - - sleep_duration = 0 - if sleep_max < sleep_min: - logger.warning(f"sleep_between_tasks: max_seconds ({sleep_max}s) is less than min_seconds ({sleep_min}s). Using max_seconds as fixed sleep duration.") - sleep_duration = sleep_max - elif sleep_max > sleep_min: - sleep_duration = random.uniform(sleep_min, sleep_max) - else: # equal - sleep_duration = sleep_min - - if sleep_duration > 0: - logger.debug(f"Profile '{profile_name}' sleeping for {sleep_duration:.2f}s before next file.") - # Interruptible sleep - sleep_end_time = time.time() + sleep_duration - while time.time() < sleep_end_time: - if state_manager.shutdown_event.is_set(): - break - time.sleep(0.2) - - return all_results - - -def run_download_worker(info_json_path, info_json_content, format_to_download, policy, args, running_processes, process_lock, state_manager, profile_name=None): - """ - Performs a single download attempt. Designed to be run in a worker thread. - """ - worker_id = get_worker_id() - display_name = sp_utils.get_display_name(info_json_path) - profile_log_part = f" [Profile: {profile_name}]" if profile_name else "" - log_prefix = f"[Worker {worker_id}]{profile_log_part} [{display_name} @ {format_to_download}]" - - download_policy = policy.get('download_policy', {}) - settings = policy.get('settings', {}) - downloader = download_policy.get('downloader') - - # Get script command from settings, with fallback to download_policy for old format. - script_cmd_str = settings.get('download_script') - if not script_cmd_str: - script_cmd_str = download_policy.get('script') - - if script_cmd_str: - download_cmd = shlex.split(script_cmd_str) - elif downloader == 'aria2c_rpc': - download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'aria-rpc'] - elif downloader == 'native-cli': - download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'cli'] - else: - # Default to the new native-py downloader if downloader is 'native-py' or not specified. 
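# Illustrative sketch (not part of the original file): the pause between files
# is a jittered, interruptible sleep so a shutdown request is honoured within
# roughly 0.2s. The helper name (jittered_sleep) is hypothetical; shutdown_event
# is any threading.Event.
import random
import threading
import time

def jittered_sleep(min_s, max_s, shutdown_event: threading.Event):
    duration = min_s if max_s <= min_s else random.uniform(min_s, max_s)
    deadline = time.time() + duration
    while time.time() < deadline and not shutdown_event.is_set():
        time.sleep(0.2)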
- download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'py'] - - download_cmd.extend(['-f', format_to_download]) - - if downloader == 'aria2c_rpc': - if download_policy.get('aria_host'): - download_cmd.extend(['--aria-host', str(download_policy['aria_host'])]) - if download_policy.get('aria_port'): - download_cmd.extend(['--aria-port', str(download_policy['aria_port'])]) - if download_policy.get('aria_secret'): - download_cmd.extend(['--aria-secret', str(download_policy['aria_secret'])]) - if download_policy.get('output_dir'): - download_cmd.extend(['--output-dir', str(download_policy['output_dir'])]) - if download_policy.get('aria_remote_dir'): - download_cmd.extend(['--remote-dir', str(download_policy['aria_remote_dir'])]) - if download_policy.get('aria_fragments_dir'): - download_cmd.extend(['--fragments-dir', str(download_policy['aria_fragments_dir'])]) - # For stress testing, waiting is the desired default to get a success/fail result. - # Allow disabling it by explicitly setting aria_wait: false in the policy. - if download_policy.get('aria_wait', True): - download_cmd.append('--wait') - - if download_policy.get('auto_merge_fragments'): - download_cmd.append('--auto-merge-fragments') - if download_policy.get('remove_fragments_after_merge'): - download_cmd.append('--remove-fragments-after-merge') - if download_policy.get('purge_on_complete'): - download_cmd.append('--purge-on-complete') - - downloader_args = download_policy.get('downloader_args') - proxy = download_policy.get('proxy') - if proxy: - # Note: proxy_rename is not supported for aria2c_rpc mode. - proxy_arg = f"--all-proxy {shlex.quote(str(proxy))}" - if downloader_args: - downloader_args = f"{downloader_args} {proxy_arg}" - else: - downloader_args = proxy_arg - - if downloader_args: - # For aria2c_rpc, the downloader_args value is passed directly to the script's --downloader-args option. - download_cmd.extend(['--downloader-args', downloader_args]) - elif downloader == 'native-cli': - # This is the logic for the legacy download_tool.py (yt-dlp CLI wrapper). - pause_seconds = download_policy.get('pause_before_download_seconds') - if pause_seconds and isinstance(pause_seconds, (int, float)) and pause_seconds > 0: - download_cmd.extend(['--pause', str(pause_seconds)]) - - if download_policy.get('continue_downloads'): - download_cmd.append('--download-continue') - - # Add proxy if specified directly in the policy - proxy = download_policy.get('proxy') - if proxy: - download_cmd.extend(['--proxy', str(proxy)]) - - proxy_rename = download_policy.get('proxy_rename') - if proxy_rename: - download_cmd.extend(['--proxy-rename', str(proxy_rename)]) - - extra_args = download_policy.get('extra_args') - if extra_args: - download_cmd.extend(shlex.split(extra_args)) - - # Note: 'downloader' here refers to yt-dlp's internal downloader, not our script. - # The policy key 'external_downloader' is more clear, but we support 'downloader' for backward compatibility. 
- ext_downloader = download_policy.get('external_downloader') or download_policy.get('downloader') - if ext_downloader and ext_downloader not in ['native-cli', 'native-py', 'aria2c_rpc']: - download_cmd.extend(['--downloader', str(ext_downloader)]) - - downloader_args = download_policy.get('downloader_args') - if downloader_args: - download_cmd.extend(['--downloader-args', str(downloader_args)]) - - if download_policy.get('merge_output_format'): - download_cmd.extend(['--merge-output-format', str(download_policy['merge_output_format'])]) - - if download_policy.get('merge_output_format'): - download_cmd.extend(['--merge-output-format', str(download_policy['merge_output_format'])]) - - else: - # This is the default logic for the new native-py downloader. - if download_policy.get('output_to_buffer'): - download_cmd.append('--output-buffer') - else: - # --output-dir is only relevant if not outputting to buffer. - if download_policy.get('output_dir'): - download_cmd.extend(['--output-dir', str(download_policy['output_dir'])]) - - if download_policy.get('config'): - download_cmd.extend(['--config', str(download_policy['config'])]) - - if download_policy.get('temp_path'): - download_cmd.extend(['--temp-path', str(download_policy['temp_path'])]) - if download_policy.get('continue_downloads'): - download_cmd.append('--download-continue') - - pause_seconds = download_policy.get('pause_before_download_seconds') - if pause_seconds and isinstance(pause_seconds, (int, float)) and pause_seconds > 0: - download_cmd.extend(['--pause', str(pause_seconds)]) - - proxy = download_policy.get('proxy') - if proxy: - download_cmd.extend(['--proxy', str(proxy)]) - - proxy_rename = download_policy.get('proxy_rename') - if proxy_rename: - download_cmd.extend(['--proxy-rename', str(proxy_rename)]) - - # The 'extra_args' from the policy are for the download script itself, not for yt-dlp. - # We need to split them and add them to the command. - extra_args = download_policy.get('extra_args') - if extra_args: - download_cmd.extend(shlex.split(extra_args)) - - # Pass through downloader settings for yt-dlp to use - # e.g. 
to tell yt-dlp to use aria2c as its backend - ext_downloader = download_policy.get('external_downloader') - if ext_downloader: - download_cmd.extend(['--downloader', str(ext_downloader)]) - - downloader_args = download_policy.get('downloader_args') - if downloader_args: - download_cmd.extend(['--downloader-args', str(downloader_args)]) - - if args.dummy: - # Create a copy to add the info.json path for logging, without modifying the original - log_cmd = list(download_cmd) - if isinstance(info_json_path, Path) and info_json_path.exists(): - log_cmd.extend(['--load-info-json', str(info_json_path)]) - else: - log_cmd.extend(['--load-info-json', '']) - - logger.info(f"{log_prefix} Dummy mode: simulating download...") - logger.info(f"{log_prefix} Dummy mode: Would run command: {' '.join(shlex.quote(s) for s in log_cmd)}") - - dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {}) - min_seconds = dummy_settings.get('download_min_seconds', 1.0) - max_seconds = dummy_settings.get('download_max_seconds', 3.0) - failure_rate = dummy_settings.get('download_failure_rate', 0.0) - skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0) - - sleep_duration = random.uniform(min_seconds, max_seconds) - logger.info(f"{log_prefix} Dummy mode: simulating download for {sleep_duration:.2f}s (from policy range {min_seconds}-{max_seconds}s).") - time.sleep(sleep_duration) # Simulate work - - rand_val = random.random() - should_fail_skipped = rand_val < skipped_rate - should_fail_fatal = not should_fail_skipped and rand_val < (skipped_rate + failure_rate) - - if should_fail_skipped: - logger.warning(f"{log_prefix} Dummy mode: Injecting simulated skipped download failure.") - return { - 'type': 'download', - 'path': str(info_json_path), - 'format': format_to_download, - 'success': False, - 'error_type': 'DummySkippedFailure', - 'details': 'FAIL (Dummy mode, skipped)', - 'downloaded_bytes': 0, - 'profile': profile_name, - 'downloaded_filepath': None, - 'is_tolerated_error': True - } - - if should_fail_fatal: - logger.warning(f"{log_prefix} Dummy mode: Injecting simulated fatal download failure.") - return { - 'type': 'download', - 'path': str(info_json_path), - 'format': format_to_download, - 'success': False, - 'error_type': 'DummyFailure', - 'details': 'FAIL (Dummy mode, fatal)', - 'downloaded_bytes': 0, - 'profile': profile_name, - 'downloaded_filepath': None - } - - downloaded_filepath = f'/dev/null/{display_name}.mp4' - if download_policy.get('output_to_airflow_ready_dir'): - output_dir = download_policy.get('output_dir') - if output_dir and os.path.isdir(output_dir): - try: - dummy_path_obj = Path(output_dir) / f"{display_name}.mp4" - dummy_path_obj.touch() - downloaded_filepath = str(dummy_path_obj) - logger.info(f"{log_prefix} Dummy mode: created dummy file for Airflow move: {downloaded_filepath}") - except OSError as e: - logger.error(f"{log_prefix} Dummy mode: failed to create dummy file in '{output_dir}': {e}") - - return { - 'type': 'download', - 'path': str(info_json_path), - 'format': format_to_download, - 'success': True, - 'error_type': None, - 'details': 'OK (Dummy mode)', - 'downloaded_bytes': random.randint(100000, 5000000), - 'profile': profile_name, - 'downloaded_filepath': downloaded_filepath - } - - logger.info(f"{log_prefix} Kicking off download process...") - - temp_info_file_path = None - try: - if isinstance(info_json_path, Path) and info_json_path.exists(): - # The info.json is already in a file, pass its path directly. 
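# Illustrative sketch (not part of the original file): the dummy-mode failure
# injection above draws one random number and carves [0, 1) into three bands:
# [0, skipped_rate) -> tolerated/skipped, [skipped_rate, skipped_rate +
# failure_rate) -> fatal, everything else -> success. The function name
# (simulate_outcome) is hypothetical.
import random

def simulate_outcome(skipped_rate: float, failure_rate: float) -> str:
    rand_val = random.random()
    if rand_val < skipped_rate:
        return 'skipped'
    if rand_val < skipped_rate + failure_rate:
        return 'fatal'
    return 'success'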
- download_cmd.extend(['--load-info-json', str(info_json_path)]) - else: - # The info.json content is in memory, so write it to a temporary file. - import tempfile - with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', encoding='utf-8') as temp_f: - temp_f.write(info_json_content) - temp_info_file_path = temp_f.name - download_cmd.extend(['--load-info-json', temp_info_file_path]) - - cmd_str_for_log = ' '.join(shlex.quote(s) for s in download_cmd) - logger.info(f"{log_prefix} Running download command: {cmd_str_for_log}") - output_to_buffer = download_policy.get('output_to_buffer', False) - retcode, stdout, stderr = run_command( - download_cmd, - running_processes, - process_lock, - binary_stdout=output_to_buffer, - stream_output=getattr(args, 'print_downloader_log', False), - stream_prefix=f"{log_prefix} | " - ) - finally: - if temp_info_file_path and os.path.exists(temp_info_file_path): - os.unlink(temp_info_file_path) - - is_403_error = "HTTP Error 403" in stderr - is_timeout_error = "Read timed out" in stderr - output_to_buffer = download_policy.get('output_to_buffer', False) - - # Parse stdout to find the downloaded file path. - # The download scripts print the final path, sometimes with a prefix. - downloaded_filepath = None - if stdout and not output_to_buffer: - lines = stdout.strip().split('\n') - if lines: - last_line = lines[-1].strip() - - # Handle aria-rpc output format: "Download... successful: " - aria_match = re.search(r'successful: (.+)', last_line) - if aria_match: - path_from_aria = aria_match.group(1).strip() - if os.path.exists(path_from_aria): - downloaded_filepath = path_from_aria - else: - logger.warning(f"[{display_name}] Path from aria-rpc output does not exist: '{path_from_aria}'") - # Handle native-py/cli output format (just the path) - elif os.path.exists(last_line): - downloaded_filepath = last_line - - result = { - 'type': 'download', - 'path': str(info_json_path), - 'format': format_to_download, - 'success': retcode == 0, - 'error_type': None, - 'details': '', - 'downloaded_bytes': 0, - 'profile': profile_name, - 'downloaded_filepath': downloaded_filepath - } - - if retcode == 0: - details_str = "OK" - size_in_bytes = 0 - if output_to_buffer: - # The most accurate size is the length of the stdout buffer. - size_in_bytes = len(stdout) # stdout is bytes - details_str += f" (Buffered {sp_utils.format_size(size_in_bytes)})" - else: - size_match = re.search(r'\[download\]\s+100%\s+of\s+~?([0-9.]+)(B|KiB|MiB|GiB)', stderr) - if size_match: - value = float(size_match.group(1)) - unit = size_match.group(2) - multipliers = {"B": 1, "KiB": 1024, "MiB": 1024**2, "GiB": 1024**3} - size_in_bytes = int(value * multipliers.get(unit, 1)) - details_str += f" ({size_match.group(1)}{unit})" - - result['downloaded_bytes'] = size_in_bytes - result['details'] = details_str - else: - # Check for shutdown first. This is the most likely cause for an abrupt non-zero exit. - if state_manager.shutdown_event.is_set(): - result['error_type'] = 'Cancelled' - result['details'] = 'Task cancelled during shutdown.' - else: - # Check both stdout and stderr for error messages, as logging might be directed to stdout. - full_output = f"{stdout}\n{stderr}" - # Look for common error indicators from both yt-dlp and our scripts. - error_lines = [ - line for line in full_output.strip().split('\n') - if 'ERROR:' in line or ' - ERROR - ' in line or 'DownloadError:' in line or 'Traceback' in line - ] - if error_lines: - # Try to get the most specific part of the error. 
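# Illustrative sketch (not part of the original file): the size reported by
# yt-dlp on stderr is converted to bytes with the same unit table as above.
# The helper name (parse_reported_size) is hypothetical.
import re

def parse_reported_size(stderr: str) -> int:
    match = re.search(r'\[download\]\s+100%\s+of\s+~?([0-9.]+)(B|KiB|MiB|GiB)', stderr)
    if not match:
        return 0
    multipliers = {"B": 1, "KiB": 1024, "MiB": 1024**2, "GiB": 1024**3}
    return int(float(match.group(1)) * multipliers[match.group(2)])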
- details = error_lines[-1].strip() - # Our log format is "timestamp - logger - LEVEL - message" - if ' - ERROR - ' in details: - details = details.split(' - ERROR - ', 1)[-1] - result['details'] = details - else: - # Fallback to last non-empty line of stderr if no explicit "ERROR" line found - stderr_lines = [line for line in stderr.strip().split('\n') if line.strip()] - result['details'] = stderr_lines[-1].strip() if stderr_lines else "Unknown error" - - if is_403_error: - result['error_type'] = 'HTTP 403' - elif is_timeout_error: - result['error_type'] = 'Timeout' - else: - result['error_type'] = f'Exit Code {retcode}' - - return result - - -def process_info_json_cycle(path, content, policy, state_manager, args, running_processes, process_lock, proxy_url=None, profile_name=None, profile_manager_instance=None): - """ - Processes one info.json file for one cycle, downloading selected formats. - """ - results = [] - display_name = sp_utils.get_display_name(path) - d_policy = policy.get('download_policy', {}) - s_conditions = policy.get('stop_conditions', {}) - - try: - info_data = json.loads(content) - - # If the task file specifies a format, use it instead of the policy's format list. - # This is for granular tasks generated by the task-generator tool. - if '_ytops_download_format' in info_data: - format_selection = info_data['_ytops_download_format'] - logger.info(f"[{display_name}] Using format '{format_selection}' from task file.") - else: - format_selection = d_policy.get('formats', '') - available_formats = [f['format_id'] for f in info_data.get('formats', [])] - if not available_formats: - logger.warning(f"[{display_name}] No formats found in info.json. Skipping.") - return [] - - formats_to_test = [] - if format_selection == 'all': - formats_to_test = available_formats - elif format_selection.startswith('random:'): - percent = float(format_selection.split(':')[1].rstrip('%')) - count = max(1, int(len(available_formats) * (percent / 100.0))) - formats_to_test = random.sample(available_formats, k=count) - elif format_selection.startswith('random_from:'): - choices = [f.strip() for f in format_selection.split(':', 1)[1].split(',')] - valid_choices = [f for f in choices if f in available_formats] - if valid_choices: - formats_to_test = [random.choice(valid_choices)] - else: - # If the format selection contains complex selector characters (other than comma), - # treat the entire string as a single format selector for yt-dlp to interpret. - # Otherwise, split by comma to test each specified format ID individually. - if any(c in format_selection for c in '/+[]()'): - requested_formats = [format_selection] - else: - requested_formats = [f.strip() for f in format_selection.split(',') if f.strip()] - - formats_to_test = [] - selector_keywords = ('best', 'worst', 'bestvideo', 'bestaudio') - - for req_fmt in requested_formats: - # Treat as a selector and pass through if it contains special characters - # or starts with a known selector keyword. - if any(c in req_fmt for c in '/+[]()') or req_fmt.startswith(selector_keywords): - formats_to_test.append(req_fmt) - continue - - # Otherwise, treat as a specific format ID that must exist. - # Check for exact match first. - if req_fmt in available_formats: - formats_to_test.append(req_fmt) - continue - - # If no exact match, check for formats that start with this ID + '-' and then digits - # e.g., req_fmt '140' should match '140-0' but not '140-something'. 
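# Illustrative sketch (not part of the original file): the split between "one
# complex yt-dlp selector" and "comma-separated format IDs" hinges on whether
# the string contains selector characters (other than the comma). The helper
# name (split_format_selection) is hypothetical.
def split_format_selection(selection: str):
    if any(c in selection for c in '/+[]()'):
        return [selection]  # e.g. "bestvideo[height<=1080]+bestaudio" stays whole
    return [f.strip() for f in selection.split(',') if f.strip()]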
- prefix_match_re = re.compile(rf'^{re.escape(req_fmt)}-\d+$') - first_match = next((af for af in available_formats if prefix_match_re.match(af)), None) - - if first_match: - logger.info(f"[{display_name}] Requested format '{req_fmt}' not found. Using first available match: '{first_match}'.") - formats_to_test.append(first_match) - else: - logger.warning(f"[{display_name}] Requested format '{req_fmt}' not found in available formats and is not a recognized selector. Skipping this format.") - - except json.JSONDecodeError: - logger.error(f"[{display_name}] Failed to parse info.json. Skipping.") - return [] - - for i, format_id in enumerate(formats_to_test): - if state_manager.shutdown_event.is_set(): - logger.info(f"Shutdown requested, stopping further format tests for {display_name}.") - break - - # Check if the format URL is expired before attempting to download - format_details = next((f for f in info_data.get('formats', []) if f.get('format_id') == format_id), None) - - # If format_id is a complex selector, it won't be found directly. As a heuristic, - # check the expiration of the first format URL, as they typically share the same expiration. - if not format_details and any(c in format_id for c in '/+[]()'): - available_formats_list = info_data.get('formats', []) - if available_formats_list: - format_details = available_formats_list[0] - - # The check is enabled by default and can be disabled via policy. - if d_policy.get('check_url_expiration', True) and format_details and 'url' in format_details: - url_to_check = format_details['url'] - time_shift_minutes = d_policy.get('expire_time_shift_minutes', 0) - status, time_left_seconds = sp_utils.check_url_expiry(url_to_check, time_shift_minutes) - - logger.debug(f"[{display_name}] URL expiration check for format '{format_id}': status={status}, time_left={time_left_seconds:.0f}s") - - if status == 'expired': - details = "Download URL is expired" - if time_shift_minutes > 0 and time_left_seconds > 0: - logger.warning(f"[{display_name}] Skipping format '{format_id}' because its URL will expire in {time_left_seconds/60:.1f}m (within {time_shift_minutes}m time-shift).") - details = f"URL will expire within {time_shift_minutes}m time-shift" - else: - logger.warning(f"[{display_name}] Skipping format '{format_id}' because its URL is expired.") - - result = { - 'type': 'download', 'path': str(path), 'format': format_id, - 'success': False, 'error_type': 'Skipped (Expired URL)', - 'details': details, 'downloaded_bytes': 0, 'is_tolerated_error': True - } - if proxy_url: - result['proxy_url'] = proxy_url - - if profile_manager_instance and profile_name: - profile_manager_instance.record_activity(profile_name, 'tolerated_error') - - state_manager.log_event(result) - results.append(result) - continue # Move to the next format - - elif status == 'no_expiry_info': - logger.debug(f"[{display_name}] No valid 'expire' parameter found in format URL for '{format_id}'. Skipping expiration check.") - - result = run_download_worker(path, content, format_id, policy, args, running_processes, process_lock, state_manager, profile_name=profile_name) - if 'id' in info_data: - result['video_id'] = info_data['id'] - if proxy_url: - result['proxy_url'] = proxy_url - - # Record download attempt/error if a profile is being used. 
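# Illustrative sketch (not part of the original file): one plausible shape for
# the expiry check used above -- googlevideo download URLs carry an 'expire'
# unix timestamp as a query parameter. The real sp_utils.check_url_expiry is
# not shown here and may differ; this approximation only mirrors the return
# convention (status, seconds_left) and the time-shift behaviour.
import time
from urllib.parse import parse_qs, urlparse

def approx_check_url_expiry(url: str, time_shift_minutes: int = 0):
    params = parse_qs(urlparse(url).query)
    expire_values = params.get('expire')
    if not expire_values or not expire_values[0].isdigit():
        return 'no_expiry_info', 0.0
    seconds_left = int(expire_values[0]) - time.time()
    if seconds_left <= time_shift_minutes * 60:
        return 'expired', seconds_left
    return 'ok', seconds_left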
- if profile_manager_instance and profile_name: - if result.get('success'): - profile_manager_instance.record_activity(profile_name, 'download') - elif result.get('error_type') == 'Cancelled': - pass # Do not record cancellations - elif result.get('is_tolerated_error'): - profile_manager_instance.record_activity(profile_name, 'tolerated_error') - else: - profile_manager_instance.record_activity(profile_name, 'download_error') - - state_manager.log_event(result) - results.append(result) - - worker_id = get_worker_id() - status = "SUCCESS" if result['success'] else f"FAILURE ({result['error_type']})" - profile_log_part = f" [Profile: {profile_name}]" if profile_name else "" - logger.info(f"[Worker {worker_id}]{profile_log_part} Result for {display_name} (format {format_id}): {status} - {result.get('details', 'OK')}") - - if not result['success']: - if s_conditions.get('on_failure') or \ - (s_conditions.get('on_http_403') and result['error_type'] == 'HTTP 403') or \ - (s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'): - logger.info(f"Stopping further format tests for {display_name} in this cycle due to failure.") - break - - sleep_cfg = d_policy.get('sleep_between_formats', {}) - sleep_min = sleep_cfg.get('min_seconds', 0) - if sleep_min > 0 and i < len(formats_to_test) - 1: - sleep_max = sleep_cfg.get('max_seconds') or sleep_min - if sleep_max > sleep_min: - sleep_duration = random.uniform(sleep_min, sleep_max) - else: - sleep_duration = sleep_min - - logger.debug(f"Sleeping for {sleep_duration:.2f}s between formats for {display_name}.") - # Interruptible sleep - sleep_end_time = time.time() + sleep_duration - while time.time() < sleep_end_time: - if state_manager.shutdown_event.is_set(): - break - time.sleep(0.2) - - return results - - -def run_throughput_worker(worker_id, policy, state_manager, args, profile_manager_instance, running_processes, process_lock, profile_prefix=None): - """A persistent worker for the 'throughput' orchestration mode.""" - owner_id = f"throughput-worker-{worker_id}" - settings = policy.get('settings', {}) - exec_control = policy.get('execution_control', {}) - - # Prioritize the passed-in profile_prefix for worker pool compatibility. - if not profile_prefix: - d_policy = policy.get('download_policy', {}) - profile_prefix = d_policy.get('profile_prefix') - - if not profile_prefix: - logger.error(f"[Worker {worker_id}] Throughput mode requires a 'profile_prefix' from the worker pool or 'download_policy'. Worker exiting.") - return [] - - no_task_streak = 0 - - while not state_manager.shutdown_event.is_set(): - locked_profile = None - claimed_task_path = None - try: - # 0. If no tasks were found previously, pause briefly. - if no_task_streak > 0: - polling_interval = exec_control.get('worker_polling_interval_seconds', 1) - logger.info(f"[Worker {worker_id}] No tasks found in previous attempt(s). Pausing for {polling_interval}s. (Streak: {no_task_streak})") - time.sleep(polling_interval) - if state_manager.shutdown_event.is_set(): continue - - # 1. Find a task and lock its associated profile - locked_profile, claimed_task_path = find_task_and_lock_profile( - profile_manager_instance, owner_id, profile_prefix, policy, worker_id - ) - - if not locked_profile: - # No task/profile combo was available. - no_task_streak += 1 - polling_interval = exec_control.get('worker_polling_interval_seconds', 1) - logger.info(f"[Worker {worker_id}] No available tasks found for any active profiles. 
Pausing for {polling_interval}s.") - time.sleep(polling_interval) - continue - - profile_name = locked_profile['name'] - - # We have a task and a lock. - if claimed_task_path: - no_task_streak = 0 # Reset streak - # 3. Process the task - try: - with open(claimed_task_path, 'r', encoding='utf-8') as f: - info_json_content = f.read() - except (IOError, FileNotFoundError) as e: - logger.error(f"[{sp_utils.get_display_name(claimed_task_path)}] Could not read claimed task file: {e}") - # Unlock profile and continue, file might be corrupted - profile_manager_instance.unlock_profile(profile_name, owner=owner_id) - locked_profile = None - # Clean up the bad file - try: claimed_task_path.unlink() - except OSError: pass - continue - - # The locked profile's proxy MUST be used for the download. - local_policy = deepcopy(policy) - local_policy.setdefault('download_policy', {})['proxy'] = locked_profile['proxy'] - - _run_download_logic( - source=claimed_task_path, - info_json_content=info_json_content, - policy=local_policy, - state_manager=state_manager, - args=args, - running_processes=running_processes, - process_lock=process_lock, - profile_name=profile_name, - profile_manager_instance=profile_manager_instance - ) - - # 4. Clean up the processed task file - try: - os.remove(claimed_task_path) - logger.debug(f"[{sp_utils.get_display_name(claimed_task_path)}] Removed processed task file.") - except OSError as e: - logger.error(f"Failed to remove processed task file '{claimed_task_path}': {e}") - else: - # This case should not be reached with the new task-first locking logic. - # If it is, it means find_task_and_lock_profile returned a profile but no task. - logger.warning(f"[Worker {worker_id}] Inconsistent state: locked profile '{profile_name}' but no task was claimed. Unlocking and continuing.") - - except Exception as e: - logger.error(f"[Worker {worker_id}] An unexpected error occurred in the worker loop: {e}", exc_info=True) - time.sleep(5) # Pause before retrying to avoid spamming errors - finally: - if locked_profile: - # 5. Unlock the profile. Only apply cooldown if a task was processed. - cooldown = None - if claimed_task_path: - # Enforcer is the only point where we configure to apply different policies, - # since we might restart enforcer, but won't restart stress-policy working on auth and downloads simultaneously. - # This is like applying a policy across multiple workers/machines without needing to restart each of them. - # DESIGN: The cooldown duration is not configured in the worker's policy. - # Instead, it is read from a central Redis key. This key is set by the - # policy-enforcer, making the enforcer the single source of truth for - # this policy. This allows changing the cooldown behavior without - # restarting the workers. - cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds') - if cooldown_config: - try: - val = json.loads(cooldown_config) - if isinstance(val, list) and len(val) == 2 and val[0] < val[1]: - cooldown = random.randint(val[0], val[1]) - elif isinstance(val, int): - cooldown = val - except (json.JSONDecodeError, TypeError): - if cooldown_config.isdigit(): - cooldown = int(cooldown_config) - - if cooldown: - logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") - - profile_manager_instance.unlock_profile( - locked_profile['name'], - owner=owner_id, - rest_for_seconds=cooldown - ) - locked_profile = None - - # 6. 
Throughput is now controlled by the enforcer via the profile's - # 'unlock_cooldown_seconds' policy, which puts the profile into a - # RESTING state. The worker does not need to sleep here and can - # immediately try to lock a new profile to maximize throughput. - - logger.info(f"[Worker {worker_id}] Worker loop finished.") - return [] # This function doesn't return results directly - - -def run_direct_batch_worker(worker_id, policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock, profile_prefix=None): - """A worker for the 'direct_batch_cli' orchestration mode.""" - owner_id = f"direct-batch-worker-{worker_id}" - settings = policy.get('settings', {}) - exec_control = policy.get('execution_control', {}) - gen_policy = policy.get('info_json_generation_policy', {}) - direct_policy = policy.get('direct_batch_cli_policy', {}) - queue_policy = policy.get('queue_policy') - - # Prioritize the passed-in profile_prefix for worker pool compatibility. - if not profile_prefix: - profile_prefix = gen_policy.get('profile_prefix') - - if not profile_prefix: - logger.error(f"[Worker {worker_id}] Direct batch mode requires a 'profile_prefix' from the worker pool or 'info_json_generation_policy'. Worker exiting.") - return [] - - batch_size = direct_policy.get('batch_size') - if not batch_size: - logger.error(f"[Worker {worker_id}] Direct batch mode requires 'direct_batch_cli_policy.batch_size'. Worker exiting.") - return [] - - save_dir = settings.get('save_info_json_dir') - if not save_dir: - logger.error(f"[Worker {worker_id}] Direct batch mode requires 'settings.save_info_json_dir'. Worker exiting.") - return [] - - os.makedirs(save_dir, exist_ok=True) - - last_used_profile_name = None - while not state_manager.shutdown_event.is_set(): - locked_profile = None - temp_batch_file = None - # --- Variables for robust finalization --- - files_created = 0 - url_batch_len = 0 - batch_started = False - downloads_per_url = 0 # Default to 0, meaning no increment unless configured - # --- - try: - # 1. Lock a profile - locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) - - # --- New logic to avoid immediate reuse --- - avoid_reuse = direct_policy.get('avoid_immediate_profile_reuse', False) - if avoid_reuse and locked_profile and last_used_profile_name and locked_profile['name'] == last_used_profile_name: - logger.info(f"[Worker {worker_id}] Re-locked same profile '{locked_profile['name']}'. Unlocking and pausing to allow for rotation.") - profile_manager_instance.unlock_profile(locked_profile['name'], owner=owner_id) - - wait_seconds = direct_policy.get('avoid_reuse_max_wait_seconds', 5) - time.sleep(wait_seconds) - - # After waiting, try to lock again. - logger.info(f"[Worker {worker_id}] Attempting to lock a new profile after waiting.") - locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) - - if locked_profile and locked_profile['name'] == last_used_profile_name: - logger.warning(f"[Worker {worker_id}] Still locking the same profile '{locked_profile['name']}' after waiting. 
Proceeding to use it to avoid getting stuck.") - elif locked_profile: - logger.info(f"[Worker {worker_id}] Switched to a different profile after waiting: '{locked_profile['name']}'.") - # --- End new logic --- - - if not locked_profile: - polling_interval = exec_control.get('worker_polling_interval_seconds', 1) - # --- Add diagnostic logging --- - all_profiles_in_pool = profile_manager_instance.list_profiles() - profiles_in_prefix = [p for p in all_profiles_in_pool if p['name'].startswith(profile_prefix)] - if profiles_in_prefix: - state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) - states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) - logger.info(f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix}*): {states_summary}. Pausing for {polling_interval}s.") - else: - logger.info(f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix}'. Pausing for {polling_interval}s.") - # --- End diagnostic logging --- - time.sleep(polling_interval) - continue - - profile_name = locked_profile['name'] - proxy_url = locked_profile['proxy'] - - # 2. Get a batch of URLs from the shared list - url_batch, start_idx = state_manager.get_next_url_batch(batch_size, urls_list) - if not url_batch: - logger.info(f"[Worker {worker_id}] No more URLs to process. Worker exiting.") - break # Exit the while loop - - url_batch_len = len(url_batch) - batch_started = True - - # --- Calculate how many download tasks will be generated --- - downloads_per_url = 0 # Default to 0, meaning no increment unless configured - downloads_per_url_config = gen_policy.get('downloads_per_url') - if downloads_per_url_config: - if isinstance(downloads_per_url_config, int): - downloads_per_url = downloads_per_url_config - elif downloads_per_url_config == 'from_download_policy': - download_policy = policy.get('download_policy', {}) - formats_str = download_policy.get('formats', '') - if formats_str: - # Use smarter parsing to handle complex yt-dlp format selectors - if any(c in formats_str for c in '/+[]()'): - num_formats = 1 - else: - num_formats = len([f for f in formats_str.split(',') if f.strip()]) - - if num_formats > 0: - downloads_per_url = num_formats - - if downloads_per_url > 0: - downloads_to_increment = url_batch_len * downloads_per_url - profile_manager_instance.increment_pending_downloads(profile_name, downloads_to_increment) - logger.info(f"[Worker {worker_id}] [{profile_name}] Preemptively incremented pending downloads by {downloads_to_increment} for the upcoming batch ({url_batch_len} URLs * {downloads_per_url} formats).") - else: - logger.warning(f"[Worker {worker_id}] [{profile_name}] 'downloads_per_url' is not configured or resolves to 0. Pending downloads counter will not be incremented for this batch.") - - end_idx = start_idx + len(url_batch) - logger.info(f"[Worker {worker_id}] [{profile_name}] Processing batch of {len(url_batch)} URLs (lines {start_idx + 1}-{end_idx} from source).") - - video_ids_in_batch = {sp_utils.get_video_id(u) for u in url_batch} - - # 3. Write URLs to a temporary batch file - with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8') as f: - temp_batch_file = f.name - f.write('\n'.join(url_batch)) - - # 4. 
Construct and run the command - ytdlp_cmd_str = direct_policy.get('ytdlp_command') - if not ytdlp_cmd_str: - logger.error(f"[Worker {worker_id}] Direct batch mode requires 'direct_batch_cli_policy.ytdlp_command'.") - break - - cmd = shlex.split(ytdlp_cmd_str) - cmd.extend(['--batch-file', temp_batch_file]) - cmd.extend(['--proxy', proxy_url]) - - # The output template should not include the .info.json extension, as - # yt-dlp adds it automatically when --write-info-json is used. - output_template_str = direct_policy.get('ytdlp_output_template', '%(id)s') - ytdlp_args = direct_policy.get('ytdlp_args') - - custom_env = direct_policy.get('env_vars', {}).copy() - - # --- PYTHONPATH for custom yt-dlp module --- - ytdlp_module_path = direct_policy.get('ytdlp_module_path') - if ytdlp_module_path: - existing_pythonpath = custom_env.get('PYTHONPATH', os.environ.get('PYTHONPATH', '')) - # Prepend the custom path to PYTHONPATH to give it precedence - custom_env['PYTHONPATH'] = f"{ytdlp_module_path}{os.pathsep}{existing_pythonpath}".strip(os.pathsep) - logger.debug(f"[Worker {worker_id}] Using custom PYTHONPATH: {custom_env['PYTHONPATH']}") - - custom_env['YTDLP_PROFILE_NAME'] = profile_name - custom_env['YTDLP_PROXY_URL'] = proxy_url - env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') - custom_env['YTDLP_SIM_MODE'] = env_name - - # Create a per-profile cache directory and set XDG_CACHE_HOME - cache_dir_base = direct_policy.get('cache_dir_base', '.cache') - profile_cache_dir = os.path.join(cache_dir_base, profile_name) - try: - os.makedirs(profile_cache_dir, exist_ok=True) - custom_env['XDG_CACHE_HOME'] = profile_cache_dir - except OSError as e: - logger.error(f"[Worker {worker_id}] Failed to create cache directory '{profile_cache_dir}': {e}") - - # --- Manage User-Agent --- - # Use a consistent User-Agent per profile, storing it in the profile's cache directory. - user_agent = None - user_agent_file = os.path.join(profile_cache_dir, 'user_agent.txt') - try: - if os.path.exists(user_agent_file): - with open(user_agent_file, 'r', encoding='utf-8') as f: - user_agent = f.read().strip() - - if not user_agent: # File doesn't exist or is empty - user_agent = sp_utils.generate_user_agent_from_policy(policy) - with open(user_agent_file, 'w', encoding='utf-8') as f: - f.write(user_agent) - logger.info(f"[{profile_name}] Generated and saved new User-Agent: '{user_agent}'") - else: - logger.info(f"[{profile_name}] Using existing User-Agent from cache: '{user_agent}'") - except IOError as e: - logger.error(f"[Worker {worker_id}] Error accessing User-Agent file '{user_agent_file}': {e}. Using generated UA for this run.") - user_agent = sp_utils.generate_user_agent_from_policy(policy) # fallback - - # Add proxy rename from policy if specified, for custom yt-dlp forks - proxy_rename = direct_policy.get('ytdlp_proxy_rename') - if proxy_rename: - custom_env['YTDLP_PROXY_RENAME'] = proxy_rename - - if user_agent: - cmd.extend(['--user-agent', user_agent]) - - if ytdlp_args: - cmd.extend(shlex.split(ytdlp_args)) - - if args.verbose and '--verbose' not in cmd: - cmd.append('--verbose') - - if args.dummy_batch: - # In dummy batch mode, we simulate the entire batch process directly. 
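# Illustrative sketch (not part of the original file): the per-profile
# User-Agent is sticky -- generated once, then reused from a small cache file
# so every batch for a profile presents the same browser fingerprint. The
# generator below is a stand-in; the real tool delegates to
# sp_utils.generate_user_agent_from_policy, and both the helper name
# (sticky_user_agent) and the UA template are hypothetical.
import os
import random

def sticky_user_agent(cache_dir, version_range):
    path = os.path.join(cache_dir, 'user_agent.txt')
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            cached = f.read().strip()
        if cached:
            return cached
    major = random.randint(*version_range)
    ua = (f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
          f"(KHTML, like Gecko) Chrome/{major}.0.0.0 Safari/537.36")
    os.makedirs(cache_dir, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(ua)
    return ua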
- log_cmd = list(cmd) - log_cmd.extend(['-o', os.path.join('temp_dir', output_template_str)]) - logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY BATCH MODE: Simulating batch of {len(url_batch)} URLs.") - logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY BATCH MODE: Would run real command: {' '.join(shlex.quote(s) for s in log_cmd)}") - logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY BATCH MODE: With environment: {custom_env}") - - dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {}) - auth_failure_rate = dummy_settings.get('auth_failure_rate', 0.0) - auth_skipped_rate = dummy_settings.get('auth_skipped_failure_rate', 0.0) - min_seconds = dummy_settings.get('auth_min_seconds', 0.1) - max_seconds = dummy_settings.get('auth_max_seconds', 0.5) - - for url in url_batch: - time.sleep(random.uniform(min_seconds, max_seconds)) - video_id = sp_utils.get_video_id(url) or f"dummy_{random.randint(1000, 9999)}" - - rand_val = random.random() - if rand_val < auth_skipped_rate: - logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating tolerated failure for {video_id}.") - profile_manager_instance.record_activity(profile_name, 'tolerated_error') - elif rand_val < (auth_skipped_rate + auth_failure_rate): - logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating fatal failure for {video_id}.") - profile_manager_instance.record_activity(profile_name, 'failure') - else: - # Success - create dummy info.json - files_created += 1 - profile_manager_instance.record_activity(profile_name, 'success') - info_data = {'id': video_id, 'title': f'Dummy Video {video_id}', '_dummy': True} - env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') - info_data['_ytops_metadata'] = { - 'profile_name': profile_name, 'proxy_url': proxy_url, - 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), - 'auth_env': env_name - } - - final_path = Path(save_dir) / f"{video_id}.info.json" - rename_template = direct_policy.get('rename_file_template') - if rename_template: - sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) - new_name = rename_template.format(video_id=video_id, profile_name=profile_name, proxy=sanitized_proxy) - final_path = Path(save_dir) / new_name - - try: - with open(final_path, 'w', encoding='utf-8') as f: - json.dump(info_data, f, indent=2) - logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY: Created dummy info.json: '{final_path}'") - except IOError as e: - logger.error(f"[Worker {worker_id}] [{profile_name}] DUMMY: Failed to write dummy info.json: {e}") - - success = (files_created > 0) - state_manager.record_batch_result(success, len(url_batch), profile_name=profile_name) - event = { 'type': 'fetch_batch', 'profile': profile_name, 'proxy_url': proxy_url, 'success': success, 'details': f"Dummy batch completed. 
Files created: {files_created}/{len(url_batch)}.", 'video_count': len(url_batch) } - state_manager.log_event(event) - - else: - with tempfile.TemporaryDirectory(prefix=f"ytdlp-batch-{worker_id}-") as temp_output_dir: - output_template = os.path.join(temp_output_dir, output_template_str) - cmd.extend(['-o', output_template]) - - logger.info(f"[Worker {worker_id}] [{profile_name}] Processing batch of {len(url_batch)} URLs...") - logger.info(f"[Worker {worker_id}] [{profile_name}] Running command: {' '.join(shlex.quote(s) for s in cmd)}") - logger.info(f"[Worker {worker_id}] [{profile_name}] With environment: {custom_env}") - retcode, stdout, stderr = run_command( - cmd, running_processes, process_lock, env=custom_env, stream_output=args.verbose, - stream_prefix=f"[Worker {worker_id} | yt-dlp] " - ) - - is_bot_error = "Sign in to confirm you're not a bot" in stderr - if is_bot_error: - logger.warning(f"[Worker {worker_id}] [{profile_name}] Bot detection occurred during batch. Marking as failure.") - - processed_files = list(Path(temp_output_dir).glob('*.json')) - - for temp_path in processed_files: - files_created += 1 - video_id = "unknown" - try: - with open(temp_path, 'r+', encoding='utf-8') as f: - info_data = json.load(f) - video_id = info_data.get('id', 'unknown') - env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') - info_data['_ytops_metadata'] = { - 'profile_name': profile_name, - 'proxy_url': proxy_url, - 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), - 'auth_env': env_name - } - f.seek(0) - json.dump(info_data, f, indent=2) - f.truncate() - - final_path = Path(save_dir) / temp_path.name - rename_template = direct_policy.get('rename_file_template') - if rename_template: - sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) - new_name = rename_template.format( - video_id=video_id, profile_name=profile_name, proxy=sanitized_proxy - ) - final_path = Path(save_dir) / new_name - - shutil.move(str(temp_path), str(final_path)) - logger.info(f"[Worker {worker_id}] Post-processed and moved info.json to '{final_path}'") - - except (IOError, json.JSONDecodeError, OSError) as e: - logger.error(f"[Worker {worker_id}] Error post-processing '{temp_path.name}' (video: {video_id}): {e}") - - # The orchestrator records per-URL success/failure for the profile. - # A batch is considered an overall success for logging if it had no fatal errors - # and produced at least one file. - success = (files_created > 0 and not is_bot_error) - - if not success: - reason = "bot detection occurred" if is_bot_error else f"0 files created out of {len(url_batch)}" - logger.warning(f"[Worker {worker_id}] [{profile_name}] Marking batch as FAILED. Reason: {reason}.") - - # Record batch stats for overall orchestrator health - state_manager.record_batch_result(success, len(url_batch), profile_name=profile_name) - - # In this mode, the custom yt-dlp script is responsible for recording - # per-URL activity ('success', 'failure', 'tolerated_error') directly into Redis. - # The orchestrator does not record activity here to avoid double-counting. - logger.info(f"[Worker {worker_id}] [{profile_name}] Batch finished. Per-URL activity was recorded by the yt-dlp script.") - - event_details = f"Batch completed. Exit: {retcode}. Files created: {files_created}/{len(url_batch)}." - if not success and stderr: - if is_bot_error: - event_details += " Stderr: Bot detection occurred." 
- else: - event_details += f" Stderr: {stderr.strip().splitlines()[-1]}" - - event = { 'type': 'fetch_batch', 'profile': profile_name, 'proxy_url': proxy_url, 'success': success, 'details': event_details, 'video_count': len(url_batch) } - state_manager.log_event(event) - - except Exception as e: - logger.error(f"[Worker {worker_id}] Unexpected error in worker loop: {e}", exc_info=True) - if locked_profile: - profile_manager_instance.record_activity(locked_profile['name'], 'failure') - finally: - if locked_profile and batch_started: - # --- Reconcile pending downloads counter --- - if downloads_per_url > 0: - initial_increment = url_batch_len * downloads_per_url - actual_downloads = files_created * downloads_per_url - adjustment = actual_downloads - initial_increment - if adjustment != 0: - logger.warning(f"[Worker {worker_id}] [{profile_name}] Reconciling pending downloads. Batch created {files_created}/{url_batch_len} successful info.json(s). Adjusting counter by {adjustment}.") - profile_manager_instance.increment_pending_downloads(locked_profile['name'], adjustment) - - if locked_profile: - last_used_profile_name = locked_profile['name'] - cooldown = None - # DESIGN: The cooldown duration is not configured in the worker's policy. - # Instead, it is read from a central Redis key. This key is set by the - # policy-enforcer, making the enforcer the single source of truth for - # this policy. This allows changing the cooldown behavior without - # restarting the workers. - cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds') - if cooldown_config: - try: - val = json.loads(cooldown_config) - if isinstance(val, list) and len(val) == 2 and val[0] < val[1]: - cooldown = random.randint(val[0], val[1]) - elif isinstance(val, int): - cooldown = val - except (json.JSONDecodeError, TypeError): - if cooldown_config.isdigit(): - cooldown = int(cooldown_config) - - if cooldown: - logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") - - profile_manager_instance.unlock_profile( - locked_profile['name'], - owner=owner_id, - rest_for_seconds=cooldown - ) - if temp_batch_file and os.path.exists(temp_batch_file): - os.unlink(temp_batch_file) - - logger.info(f"[Worker {worker_id}] Worker loop finished.") - return [] - - -def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock, profile_prefix=None): - """A worker for the 'direct_docker_cli' orchestration mode (fetch_only).""" - owner_id = f"direct-docker-worker-{worker_id}" - settings = policy.get('settings', {}) - exec_control = policy.get('execution_control', {}) - gen_policy = policy.get('info_json_generation_policy', {}) - direct_policy = policy.get('direct_docker_cli_policy', {}) - queue_policy = policy.get('queue_policy') - - # Prioritize the passed-in profile_prefix for worker pool compatibility. - # If it's not passed (e.g. legacy 'workers' mode), fall back to policy. - if not profile_prefix: - profile_prefix = gen_policy.get('profile_prefix') - - # Unlike other modes, this worker can function without a prefix (it will try to lock any active profile). - # The check `if not profile_prefix` is removed to allow this flexibility. 
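# Illustrative sketch (not part of the original file): the cooldown read from
# the enforcer's Redis config key may be a bare integer ("30") or a JSON
# [min, max] pair ("[20, 60]"); both worker loops above parse it the same way.
# The helper name (parse_cooldown) is hypothetical.
import json
import random

def parse_cooldown(raw):
    if not raw:
        return None
    try:
        val = json.loads(raw)
        if isinstance(val, list) and len(val) == 2 and val[0] < val[1]:
            return random.randint(val[0], val[1])
        if isinstance(val, int):
            return val
    except (json.JSONDecodeError, TypeError):
        if raw.isdigit():
            return int(raw)
    return None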
- - batch_size = direct_policy.get('batch_size') - if not batch_size and queue_policy: - batch_size = queue_policy.get('batch_size') - - if not batch_size: - logger.error(f"[Worker {worker_id}] Direct docker mode requires 'batch_size' in 'direct_docker_cli_policy' or 'queue_policy'. Worker exiting.") - return [] - - save_dir = settings.get('save_info_json_dir') - if not save_dir: - logger.error(f"[Worker {worker_id}] Direct docker mode requires 'settings.save_info_json_dir'. Worker exiting.") - return [] - os.makedirs(save_dir, exist_ok=True) - - # --- Docker specific config --- - image_name = direct_policy.get('docker_image_name') - host_mount_path = os.path.abspath(direct_policy.get('docker_host_mount_path')) - container_mount_path = direct_policy.get('docker_container_mount_path') - host_cache_path = direct_policy.get('docker_host_cache_path') - if host_cache_path: host_cache_path = os.path.abspath(host_cache_path) - container_cache_path = direct_policy.get('docker_container_cache_path') - network_name = direct_policy.get('docker_network_name') - - if not all([image_name, host_mount_path, container_mount_path]): - logger.error(f"[Worker {worker_id}] Direct docker mode requires 'docker_image_name', 'docker_host_mount_path', and 'docker_container_mount_path'. Worker exiting.") - return [] - - try: - os.makedirs(host_mount_path, exist_ok=True) - except OSError as e: - logger.error(f"[Worker {worker_id}] Could not create docker_host_mount_path '{host_mount_path}': {e}. Worker exiting.") - return [] - - last_used_profile_name = None - while not state_manager.shutdown_event.is_set(): - locked_profile = None - temp_task_dir_host = None - task_batch = [] - # --- Variables for robust finalization --- - live_success_count = 0 - url_batch_len = 0 - batch_started = False - downloads_per_url = 0 # Default to 0, meaning no increment unless configured - # --- - try: - # 1. Lock a profile - locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) - - # --- New logic to avoid immediate reuse --- - avoid_reuse = direct_policy.get('avoid_immediate_profile_reuse', False) - if avoid_reuse and locked_profile and last_used_profile_name and locked_profile['name'] == last_used_profile_name: - logger.info(f"[Worker {worker_id}] Re-locked same profile '{locked_profile['name']}'. Unlocking and pausing to allow for rotation.") - profile_manager_instance.unlock_profile(locked_profile['name'], owner=owner_id) - - wait_seconds = direct_policy.get('avoid_reuse_max_wait_seconds', 5) - time.sleep(wait_seconds) - - # After waiting, try to lock again. - logger.info(f"[Worker {worker_id}] Attempting to lock a new profile after waiting.") - locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) - - if locked_profile and locked_profile['name'] == last_used_profile_name: - logger.warning(f"[Worker {worker_id}] Still locking the same profile '{locked_profile['name']}' after waiting. 
Proceeding to use it to avoid getting stuck.") - elif locked_profile: - logger.info(f"[Worker {worker_id}] Switched to a different profile after waiting: '{locked_profile['name']}'.") - # --- End new logic --- - - if not locked_profile: - polling_interval = exec_control.get('worker_polling_interval_seconds', 1) - # --- Add diagnostic logging --- - all_profiles_in_pool = profile_manager_instance.list_profiles() - profiles_in_prefix = [p for p in all_profiles_in_pool if p['name'].startswith(profile_prefix or '')] - if profiles_in_prefix: - state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) - states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) - logger.info(f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s.") - else: - logger.info(f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s.") - # --- End diagnostic logging --- - time.sleep(polling_interval) - continue - - profile_name = locked_profile['name'] - proxy_url = locked_profile['proxy'] - - # --- Manage User-Agent, Visitor ID, and Cache Directory --- - user_agent = None - visitor_id = None - environment = {} - profile_cache_dir_host = None - if host_cache_path: - profile_cache_dir_host = os.path.join(host_cache_path, profile_name) - try: - os.makedirs(profile_cache_dir_host, exist_ok=True) - if container_cache_path: - environment['XDG_CACHE_HOME'] = container_cache_path - - # --- User-Agent --- - user_agent_file = os.path.join(profile_cache_dir_host, 'user_agent.txt') - if os.path.exists(user_agent_file): - with open(user_agent_file, 'r', encoding='utf-8') as f: - user_agent = f.read().strip() - - if not user_agent: - user_agent = sp_utils.generate_user_agent_from_policy(policy) - with open(user_agent_file, 'w', encoding='utf-8') as f: - f.write(user_agent) - logger.info(f"[{profile_name}] Generated and saved new User-Agent: '{user_agent}'") - else: - logger.info(f"[{profile_name}] Using existing User-Agent from cache: '{user_agent}'") - - # --- Visitor ID --- - if direct_policy.get('track_visitor_id'): - visitor_id_file = os.path.join(profile_cache_dir_host, 'visitor_id.txt') - if os.path.exists(visitor_id_file): - with open(visitor_id_file, 'r', encoding='utf-8') as f: - visitor_id = f.read().strip() - if visitor_id: - logger.info(f"[{profile_name}] Using existing Visitor ID from cache: '{visitor_id}'") - - except IOError as e: - logger.error(f"[Worker {worker_id}] Error accessing cache file in '{profile_cache_dir_host}': {e}. Using generated UA for this run.") - user_agent = sp_utils.generate_user_agent_from_policy(policy) # Fallback for UA - else: - # Fallback if no cache is configured for auth simulation - user_agent = sp_utils.generate_user_agent_from_policy(policy) - - # 2. Get a batch of URLs - url_batch = [] - start_idx = 0 - - if not queue_policy: - url_batch, start_idx = state_manager.get_next_url_batch(batch_size, urls_list) - else: - task_batch = state_manager.get_auth_tasks_batch(batch_size) - if task_batch: - # Mark tasks as in-progress - for task in task_batch: - task_id = task.get('id') or task.get('task_id') - if task_id: - state_manager.mark_auth_in_progress(task_id, owner_id) - url_batch = [task.get('url') for task in task_batch if task.get('url')] - - if not url_batch: - if not queue_policy: - logger.info(f"[Worker {worker_id}] No more URLs to process. 
Worker exiting.") - break # Exit the while loop - else: - # Queue mode: no tasks, unlock and poll - polling_interval = exec_control.get('worker_polling_interval_seconds', 1) - logger.debug(f"[Worker {worker_id}] No tasks in queue for profile '{profile_name}'. Unlocking and sleeping for {polling_interval}s.") - profile_manager_instance.unlock_profile(profile_name, owner=owner_id) - locked_profile = None - time.sleep(polling_interval) - continue - - url_batch_len = len(url_batch) - batch_started = True - - # --- Calculate how many download tasks will be generated --- - downloads_per_url = 0 # Default to 0, meaning no increment unless configured - downloads_per_url_config = gen_policy.get('downloads_per_url') - if downloads_per_url_config: - if isinstance(downloads_per_url_config, int): - downloads_per_url = downloads_per_url_config - elif downloads_per_url_config == 'from_download_policy': - download_policy = policy.get('download_policy', {}) - formats_str = download_policy.get('formats', '') - if formats_str: - # Use smarter parsing to handle complex yt-dlp format selectors - if any(c in formats_str for c in '/+[]()'): - num_formats = 1 - else: - num_formats = len([f for f in formats_str.split(',') if f.strip()]) - - if num_formats > 0: - downloads_per_url = num_formats - - if downloads_per_url > 0: - downloads_to_increment = url_batch_len * downloads_per_url - profile_manager_instance.increment_pending_downloads(profile_name, downloads_to_increment) - logger.info(f"[Worker {worker_id}] [{profile_name}] Preemptively incremented pending downloads by {downloads_to_increment} for the upcoming batch ({url_batch_len} URLs * {downloads_per_url} formats).") - else: - logger.warning(f"[Worker {worker_id}] [{profile_name}] 'downloads_per_url' is not configured or resolves to 0. Pending downloads counter will not be incremented for this batch.") - - end_idx = start_idx + len(url_batch) - logger.info(f"[Worker {worker_id}] [{profile_name}] Processing batch of {len(url_batch)} URLs (lines {start_idx + 1}-{end_idx} from source).") - - # 3. Prepare files on the host - temp_task_dir_host = tempfile.mkdtemp(prefix=f"docker-task-{worker_id}-", dir=host_mount_path) - task_dir_name = os.path.basename(temp_task_dir_host) - task_dir_container = os.path.join(container_mount_path, task_dir_name) - - # The config file path is passed explicitly to yt-dlp via --config-locations, - # so setting XDG_CONFIG_HOME in the environment is redundant. - - # Write batch file - temp_batch_file_host = os.path.join(temp_task_dir_host, 'batch.txt') - with open(temp_batch_file_host, 'w', encoding='utf-8') as f: - f.write('\n'.join(url_batch)) - - # Write yt-dlp config file - base_config_content = "" - base_config_file = direct_policy.get('ytdlp_config_file') - if base_config_file: - # Try path as-is first, then relative to project root. 
- config_path_to_read = Path(base_config_file) - if not config_path_to_read.exists(): - config_path_to_read = Path(sp_utils._PROJECT_ROOT) / base_config_file - - if config_path_to_read.exists(): - try: - with open(config_path_to_read, 'r', encoding='utf-8') as f: - base_config_content = f.read() - logger.info(f"[Worker {worker_id}] [{profile_name}] Loaded base config from '{config_path_to_read}'") - except IOError as e: - logger.error(f"[Worker {worker_id}] Could not read ytdlp_config_file '{config_path_to_read}': {e}") - else: - logger.error(f"[Worker {worker_id}] Could not find ytdlp_config_file: '{base_config_file}'") - - config_overrides = direct_policy.get('ytdlp_config_overrides', {}).copy() - - if direct_policy.get('use_cookies') and host_cache_path and container_cache_path: - try: - cookie_file_host = os.path.join(profile_cache_dir_host, 'cookies.txt') - # Ensure the file exists and has the Netscape header if it's empty. - if not os.path.exists(cookie_file_host) or os.path.getsize(cookie_file_host) == 0: - with open(cookie_file_host, 'w', encoding='utf-8') as f: - f.write("# Netscape HTTP Cookie File\n") - logger.info(f"[{profile_name}] Created/initialized cookie file with header: {cookie_file_host}") - - cookie_file_container = os.path.join(container_cache_path, 'cookies.txt') - config_overrides['cookies'] = cookie_file_container - logger.info(f"[{profile_name}] Using persistent cookie jar: {cookie_file_host}") - except (IOError, OSError) as e: - logger.error(f"[Worker {worker_id}] Could not create cookie file in '{profile_cache_dir_host}': {e}") - - # Inject per-task values into overrides - config_overrides['proxy'] = proxy_url - config_overrides['batch-file'] = os.path.join(task_dir_container, 'batch.txt') - # The output template should not include the .info.json extension, as - # yt-dlp adds it automatically when --write-info-json is used. - config_overrides['output'] = os.path.join(task_dir_container, '%(id)s') - if user_agent: - config_overrides['user-agent'] = user_agent - - overrides_content = sp_utils._config_dict_to_flags_file_content(config_overrides) - raw_args_from_policy = direct_policy.get('ytdlp_raw_args', []) - - # --- Inject visitor_id into raw args if available --- - if visitor_id: - # Start with a copy of the raw args from policy - new_raw_args = list(raw_args_from_policy) - - # --- Handle youtube extractor args --- - youtube_arg_index = -1 - original_youtube_value = None - for i, arg in enumerate(new_raw_args): - if arg.startswith('--extractor-args') and 'youtube:' in arg: - youtube_arg_index = i - try: - parts = shlex.split(arg) - if len(parts) == 2 and parts[1].startswith('youtube:'): - original_youtube_value = parts[1] - except ValueError: - logger.warning(f"Could not parse extractor-arg, will not modify: {arg}") - break # Found it, stop searching - - if youtube_arg_index != -1 and original_youtube_value: - # Modify existing youtube arg - new_value = f'{original_youtube_value.rstrip()};visitor_data={visitor_id}' - if 'skip=' in new_value: - new_value = re.sub(r'skip=([^;\'"]*)', r'skip=\1,webpage,configs', new_value) - else: - new_value += ';skip=webpage,configs' - new_raw_args[youtube_arg_index] = f'--extractor-args "{new_value}"' - else: - # Add new youtube arg - logger.warning(f"[{profile_name}] No existing '--extractor-args youtube:...' found. 
Adding a new one for visitor_id.") - new_raw_args.append(f'--extractor-args "youtube:visitor_data={visitor_id};skip=webpage,configs"') - - # --- Handle youtubetab extractor args --- - youtubetab_arg_index = -1 - for i, arg in enumerate(new_raw_args): - if arg.startswith('--extractor-args') and 'youtubetab:' in arg: - youtubetab_arg_index = i - break - - # The request is to set/replace this argument - new_youtubetab_arg = '--extractor-args "youtubetab:skip=webpage"' - if youtubetab_arg_index != -1: - # Replace existing - new_raw_args[youtubetab_arg_index] = new_youtubetab_arg - else: - # Add new - new_raw_args.append(new_youtubetab_arg) - - raw_args_from_policy = new_raw_args - # --- End visitor_id injection --- - - raw_args_content = '\n'.join(raw_args_from_policy) - - config_content = f"{base_config_content.strip()}\n\n# --- Overrides from policy ---\n{overrides_content}" - if raw_args_content: - config_content += f"\n\n# --- Raw args from policy ---\n{raw_args_content}" - - logger.info(f"[Worker {worker_id}] [{profile_name}] Generated yt-dlp config file content:\n---config---\n{config_content}\n------------") - - # Create the directory structure yt-dlp expects inside the temp task dir - ytdlp_config_dir_host = os.path.join(temp_task_dir_host, 'yt-dlp') - os.makedirs(ytdlp_config_dir_host, exist_ok=True) - temp_config_file_host = os.path.join(ytdlp_config_dir_host, 'config') - with open(temp_config_file_host, 'w', encoding='utf-8') as f: - f.write(config_content) - - # 4. Construct and run the 'docker run' command - volumes = { - host_mount_path: { - 'bind': container_mount_path, - 'mode': 'rw' - } - } - if host_cache_path and container_cache_path: - profile_cache_dir_host = os.path.join(host_cache_path, profile_name) - os.makedirs(profile_cache_dir_host, exist_ok=True) - volumes[profile_cache_dir_host] = { - 'bind': container_cache_path, - 'mode': 'rw' - } - - # The command tells yt-dlp exactly where to find the config file we created. 
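# Illustration only: a volumes mapping like the one assembled above corresponds
# to standard `docker run -v host:container:mode` flags. The project's
# run_docker_container() helper is not reproduced here, and the example paths
# are hypothetical.
import shlex

def volume_flags(volumes: dict) -> list:
    flags = []
    for host_path, spec in volumes.items():
        flags += ["-v", f"{host_path}:{spec['bind']}:{spec.get('mode', 'rw')}"]
    return flags

_vols = {
    "/srv/run/docker_mount": {"bind": "/data", "mode": "rw"},
    "/srv/cache/user1_a": {"bind": "/root/.cache", "mode": "rw"},
}
print(" ".join(["docker", "run", "--rm"] + [shlex.quote(f) for f in volume_flags(_vols)]))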
- command = ['yt-dlp', '--config-locations', os.path.join(task_dir_container, 'yt-dlp/config')] - logger.info(f"[Worker {worker_id}] [{profile_name}] Running docker command: {' '.join(shlex.quote(s) for s in command)}") - - # For logging purposes, construct the full equivalent command line with host paths - log_config_overrides_for_host = config_overrides.copy() - log_config_overrides_for_host['batch-file'] = temp_batch_file_host - log_config_overrides_for_host['output'] = os.path.join(temp_task_dir_host, '%(id)s') - if 'cookies' in log_config_overrides_for_host and host_cache_path: - log_config_overrides_for_host['cookies'] = os.path.join(profile_cache_dir_host, 'cookies.txt') - - log_command_override = ['yt-dlp'] - if base_config_content: - log_command_override.extend(sp_utils._parse_config_file_to_cli_args(base_config_content)) - log_command_override.extend(sp_utils._config_dict_to_cli_flags(log_config_overrides_for_host)) - for raw_arg in raw_args_from_policy: - log_command_override.extend(shlex.split(raw_arg)) - - # --- Live log parsing and activity recording --- - live_failure_count = 0 - live_tolerated_count = 0 - activity_lock = threading.Lock() - - # Get error patterns from policy for live parsing - tolerated_error_patterns = direct_policy.get('tolerated_error_patterns', []) - fatal_error_patterns = direct_policy.get('fatal_error_patterns', []) - - def log_parser_callback(line): - nonlocal live_success_count, live_failure_count, live_tolerated_count - - # --- Visitor ID Extraction --- - if direct_policy.get('track_visitor_id') and profile_cache_dir_host: - # e.g., [debug] [youtube] [pot:cache] TRACE: Retrieved cache spec PoTokenCacheSpec(key_bindings={'t': 'webpo', 'cb': '...', 'cbt': 'visitor_id', ... - match = re.search(r"'cb': '([^']*)', 'cbt': 'visitor_id'", line) - if match: - new_visitor_id = match.group(1) - logger.info(f"[Worker {worker_id}] [{profile_name}] Detected new Visitor ID: {new_visitor_id}") - try: - visitor_id_file = os.path.join(profile_cache_dir_host, 'visitor_id.txt') - with open(visitor_id_file, 'w', encoding='utf-8') as f: - f.write(new_visitor_id) - logger.info(f"[{profile_name}] Saved new Visitor ID to cache.") - except IOError as e: - logger.error(f"[{profile_name}] Failed to save new Visitor ID to cache: {e}") - - # Success is the highest priority check - if '[info] Writing video metadata as JSON to:' in line: - post_processed_successfully = False - # --- Immediate post-processing --- - try: - path_match = re.search(r"Writing video metadata as JSON to: '?([^']+)'?$", line) - if not path_match: - path_match = re.search(r"Writing video metadata as JSON to: (.*)$", line) - - if path_match: - container_file_path = path_match.group(1).strip() - - if container_file_path.startswith(container_mount_path): - relative_path = os.path.relpath(container_file_path, container_mount_path) - host_file_path = os.path.join(host_mount_path, relative_path) - - # The file might not exist immediately. 
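# Sketch of the bounded wait performed just below: the file named in the
# container log may not be visible on the host mount yet, so poll briefly
# (5 attempts x 0.1 s, matching the retry loop) before post-processing it.
import os
import time

def wait_for_file(path: str, attempts: int = 5, delay: float = 0.1) -> bool:
    for _ in range(attempts):
        if os.path.exists(path):
            return True
        time.sleep(delay)
    return os.path.exists(path)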
- for _ in range(5): # Retry for up to 0.5s - if os.path.exists(host_file_path): - break - time.sleep(0.1) - - if os.path.exists(host_file_path): - post_processed_successfully = _post_process_and_move_info_json( - Path(host_file_path), profile_name, proxy_url, policy, worker_id, - state_manager, profile_manager_instance=profile_manager_instance - ) - else: - logger.warning(f"File from log not found on host for immediate processing: {host_file_path}") - except Exception as e: - logger.error(f"Error during immediate post-processing from log line: {e}") - - with activity_lock: - if post_processed_successfully: - live_success_count += 1 - logger.info(f"[Worker {worker_id}] [{profile_name}] Live success #{live_success_count} detected and post-processed.") - profile_manager_instance.record_activity(profile_name, 'success') - else: - live_failure_count += 1 - logger.error(f"[Worker {worker_id}] [{profile_name}] Post-processing failed for a successful fetch. Recording as failure.") - profile_manager_instance.record_activity(profile_name, 'failure') - # --- End immediate post-processing --- - return False - - # Check for fatal patterns (e.g., bot detection) which might not start with ERROR: - for pattern in fatal_error_patterns: - if re.search(pattern, line, re.IGNORECASE): - with activity_lock: - live_failure_count += 1 - logger.error(f"[Worker {worker_id}] [{profile_name}] Live FATAL error #{live_failure_count} detected from log: {line}") - profile_manager_instance.record_activity(profile_name, 'failure') - if direct_policy.get('ban_on_fatal_error_in_batch'): - logger.warning(f"Banning profile '{profile_name}' immediately due to fatal error to stop container.") - profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during batch') - return True # Signal to stop container - return False # Do not stop if ban_on_fatal_error_in_batch is false - - # Only process lines that contain ERROR: for tolerated/generic failures - if 'ERROR:' not in line: - return False - - # Check if it's a tolerated error - for pattern in tolerated_error_patterns: - if re.search(pattern, line, re.IGNORECASE): - with activity_lock: - live_tolerated_count += 1 - logger.warning(f"[Worker {worker_id}] [{profile_name}] Live TOLERATED error #{live_tolerated_count} detected from log: {line}") - profile_manager_instance.record_activity(profile_name, 'tolerated_error') - return False - - # If it's an ERROR: line and not tolerated, it's a failure - with activity_lock: - live_failure_count += 1 - logger.warning(f"[Worker {worker_id}] [{profile_name}] Live failure #{live_failure_count} detected from log: {line}") - profile_manager_instance.record_activity(profile_name, 'failure') - - return False - - if args.dummy_batch: - logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY BATCH MODE: Simulating Docker batch of {len(url_batch)} URLs.") - logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY BATCH MODE: Would run docker command: {' '.join(shlex.quote(s) for s in command)}") - logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY BATCH MODE: With environment: {environment}") - - dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {}) - auth_failure_rate = dummy_settings.get('auth_failure_rate', 0.0) - auth_skipped_rate = dummy_settings.get('auth_skipped_failure_rate', 0.0) - min_seconds = dummy_settings.get('auth_min_seconds', 0.1) - max_seconds = dummy_settings.get('auth_max_seconds', 0.5) - - for url in url_batch: - time.sleep(random.uniform(min_seconds, max_seconds)) - 
video_id = sp_utils.get_video_id(url) or f"dummy_{random.randint(1000, 9999)}" - - rand_val = random.random() - if rand_val < auth_skipped_rate: - logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating tolerated failure for {video_id}.") - live_tolerated_count += 1 - profile_manager_instance.record_activity(profile_name, 'tolerated_error') - elif rand_val < (auth_skipped_rate + auth_failure_rate): - logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating fatal failure for {video_id}.") - live_failure_count += 1 - profile_manager_instance.record_activity(profile_name, 'failure') - else: - # Success - create dummy info.json - live_success_count += 1 - profile_manager_instance.record_activity(profile_name, 'success') - info_data = {'id': video_id, 'title': f'Dummy Video {video_id}', '_dummy': True} - env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') - info_data['_ytops_metadata'] = { - 'profile_name': profile_name, 'proxy_url': proxy_url, - 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), - 'auth_env': env_name - } - - final_path = Path(save_dir) / f"{video_id}.info.json" - rename_template = direct_policy.get('rename_file_template') - if rename_template: - sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) - new_name = rename_template.format(video_id=video_id, profile_name=profile_name, proxy=sanitized_proxy) - final_path = Path(save_dir) / new_name - - try: - with open(final_path, 'w', encoding='utf-8') as f: - json.dump(info_data, f, indent=2) - logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY: Created dummy info.json: '{final_path}'") - except IOError as e: - logger.error(f"[Worker {worker_id}] [{profile_name}] DUMMY: Failed to write dummy info.json: {e}") - - retcode = 0 - stdout, stderr, stop_reason = "", "", None - - else: - retcode, stdout, stderr, stop_reason = run_docker_container( - image_name=image_name, - command=command, - volumes=volumes, - stream_prefix=f"[Worker {worker_id} | docker-ytdlp] ", - network_name=network_name, - log_callback=log_parser_callback, - profile_manager=profile_manager_instance, - profile_name=profile_name, - environment=environment, - log_command_override=log_command_override - ) - - # 5. Post-process results - logger.info(f"[Worker {worker_id}] [{profile_name}] Docker container finished. Post-processing results...") - full_output = f"{stdout}\n{stderr}" - is_bot_error = "Sign in to confirm you're not a bot" in full_output - if is_bot_error: - logger.warning(f"[Worker {worker_id}] [{profile_name}] Bot detection occurred during batch. Marking as failure.") - - # Fallback post-processing for any files missed by the live parser. - # The live parser moves files, so this loop should only find leftovers. - # This is no longer needed for dummy mode as it writes directly to the final destination. - if not args.dummy_batch: - processed_files = list(Path(temp_task_dir_host).glob('*.json')) - if processed_files: - logger.info(f"[Worker {worker_id}] Found {len(processed_files)} leftover file(s) to process after live parsing.") - for temp_path in processed_files: - _post_process_and_move_info_json(temp_path, profile_name, proxy_url, policy, worker_id, state_manager, profile_manager_instance=profile_manager_instance) - - # A batch is considered an overall success for logging if it had no fatal errors. - # The per-URL activity has already been recorded live. - # We use live_success_count for a more accurate success metric. 
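# Sketch of the batch-outcome rule described in the comments above: the batch
# counts as successful if at least one info.json was produced and no bot
# detection was seen; URLs the live parser never accounted for are then treated
# as failures. Plain ints stand in here for the parser's shared counters.
def summarize_batch(url_count, successes, failures, tolerated, bot_detected):
    success = successes > 0 and not bot_detected
    unaccounted_failures = max(0, url_count - (successes + failures + tolerated))
    return success, unaccounted_failures

# e.g. 25 URLs with 20 successes, 2 failures, 1 tolerated error, no bot page:
assert summarize_batch(25, 20, 2, 1, False) == (True, 2)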
- success = (live_success_count > 0 and not is_bot_error) - - if not success: - reason = "bot detection occurred" if is_bot_error else f"0 successful files created out of {len(url_batch)}" - logger.warning(f"[Worker {worker_id}] [{profile_name}] Marking batch as FAILED. Reason: {reason}.") - - # Record batch stats for overall orchestrator health - state_manager.record_batch_result(success, len(url_batch), profile_name=profile_name) - - # If live parsing didn't catch all activity (e.g., yt-dlp exits before printing logs), - # we reconcile the counts here based on files created. - with activity_lock: - processed_count = live_success_count + live_failure_count + live_tolerated_count - - # Failures are harder to reconcile from file counts alone. - # We assume live parsing caught them. The total number of failures is - # (batch_size - files_created), but we don't know if they were already recorded. - # The current live parsing is the most reliable source for failures. - unaccounted_failures = len(url_batch) - processed_count - if unaccounted_failures > 0: - logger.info(f"[Worker {worker_id}] [{profile_name}] Reconciling activity: {unaccounted_failures} unaccounted failure(s).") - for _ in range(unaccounted_failures): - profile_manager_instance.record_activity(profile_name, 'failure') - - - if stop_reason: - logger.warning(f"[Worker {worker_id}] [{profile_name}] Batch aborted due to: {stop_reason}. Adjusting URL index.") - # The batch was from start_idx to end_idx. - # We processed `processed_count` URLs. - # The next batch should start from `start_idx + processed_count`. - # `get_next_url_batch` updated `last_url_index` to `end_idx`. We need to rewind it. - with activity_lock: - processed_count = live_success_count + live_failure_count + live_tolerated_count - next_start_index = start_idx + processed_count - state_manager.update_last_url_index(next_start_index, force=True) - logger.info(f"[Worker {worker_id}] Rewound URL index to {next_start_index} for next worker.") - - event_details = f"Docker batch completed. Exit: {retcode}. Files created: {live_success_count}/{len(url_batch)}. (Live successes: {live_success_count}, Live failures: {live_failure_count}, Live tolerated: {live_tolerated_count})" - if not success and stderr: - event_details += f" Stderr: {stderr.strip().splitlines()[-1] if stderr.strip() else 'N/A'}" - if stop_reason: - event_details += f" Aborted: {stop_reason}." - - event = { 'type': 'fetch_batch', 'profile': profile_name, 'proxy_url': proxy_url, 'success': success, 'details': event_details, 'video_count': len(url_batch) } - state_manager.log_event(event) - - logger.info(f"[Worker {worker_id}] [{profile_name}] Batch processing complete. Worker will now unlock profile and attempt next batch.") - - except Exception as e: - logger.error(f"[Worker {worker_id}] Unexpected error in worker loop: {e}", exc_info=True) - if locked_profile: - profile_manager_instance.record_activity(locked_profile['name'], 'failure') - finally: - if locked_profile and batch_started: - # --- Reconcile pending downloads counter --- - if downloads_per_url > 0: - # The initial increment was (url_batch_len * downloads_per_url). - # The actual number of downloads that will happen is (live_success_count * downloads_per_url). - # The adjustment is the difference. 
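# Sketch of the pending-downloads arithmetic spelled out in the comments above:
# the counter was pre-incremented for every URL in the batch, so it is adjusted
# afterwards by the difference between expected and actual info.json files.
def pending_downloads_adjustment(url_batch_len, successful_info_jsons, downloads_per_url):
    initial_increment = url_batch_len * downloads_per_url
    actual_downloads = successful_info_jsons * downloads_per_url
    return actual_downloads - initial_increment

# Example: 5 URLs x 1 format were pre-counted, 3 info.jsons were created,
# so the counter is adjusted by -2.
assert pending_downloads_adjustment(5, 3, 1) == -2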
- initial_increment = url_batch_len * downloads_per_url - actual_downloads = live_success_count * downloads_per_url - adjustment = actual_downloads - initial_increment - - if adjustment != 0: - logger.warning(f"[Worker {worker_id}] [{profile_name}] Reconciling pending downloads. Batch created {live_success_count}/{url_batch_len} successful info.json(s). Adjusting counter by {adjustment}.") - profile_manager_instance.increment_pending_downloads(locked_profile['name'], adjustment) - - if locked_profile: - last_used_profile_name = locked_profile['name'] - cooldown = None - # DESIGN: The cooldown duration is not configured in the worker's policy. - # Instead, it is read from a central Redis key. This key is set by the - # policy-enforcer, making the enforcer the single source of truth for - # this policy. This allows changing the cooldown behavior without - # restarting the workers. - cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds') - if cooldown_config: - try: - val = json.loads(cooldown_config) - if isinstance(val, list) and len(val) == 2 and val[0] < val[1]: - cooldown = random.randint(val[0], val[1]) - elif isinstance(val, int): - cooldown = val - except (json.JSONDecodeError, TypeError): - if isinstance(cooldown_config, str) and cooldown_config.isdigit(): - cooldown = int(cooldown_config) - - if cooldown: - logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") - - profile_manager_instance.unlock_profile( - locked_profile['name'], - owner=owner_id, - rest_for_seconds=cooldown - ) - if temp_task_dir_host and os.path.exists(temp_task_dir_host): - # If shutdown is requested, a batch might have been interrupted after files were - # created but before they were post-processed. We preserve the temp directory - # to allow for manual recovery of the info.json files. - if state_manager.shutdown_event.is_set() and any(Path(temp_task_dir_host).iterdir()): - logger.warning(f"Shutdown requested. Preserving temporary task directory for manual recovery: {temp_task_dir_host}") - else: - shutil.rmtree(temp_task_dir_host) - - logger.info(f"[Worker {worker_id}] Worker loop finished.") - return [] - - -def run_direct_docker_download_worker(worker_id, policy, state_manager, args, profile_manager_instance, running_processes, process_lock, profile_prefix=None): - """A worker for the 'direct_docker_cli' orchestration mode with `mode: download_only`.""" - logger.info(f"[Worker {worker_id}] Download worker thread started for pool '{profile_prefix or '*'}'.") - try: - owner_id = f"direct-docker-dl-worker-{worker_id}" - settings = policy.get('settings', {}) - exec_control = policy.get('execution_control', {}) - d_policy = policy.get('download_policy', {}) - direct_policy = policy.get('direct_docker_cli_policy', {}) - queue_policy = policy.get('queue_policy') - - # Prioritize the passed-in profile_prefix for worker pool compatibility. - if not profile_prefix: - profile_prefix = d_policy.get('profile_prefix') - - # Unlike other modes, this worker can function without a prefix (it will try to lock any active profile). - # The check `if not profile_prefix` is removed to allow this flexibility. 
- - # --- Docker specific config --- - image_name = direct_policy.get('docker_image_name') - host_mount_path = direct_policy.get('docker_host_mount_path') - container_mount_path = direct_policy.get('docker_container_mount_path') - host_download_path = direct_policy.get('docker_host_download_path') - container_download_path = direct_policy.get('docker_container_download_path') - network_name = direct_policy.get('docker_network_name') - - if not all([image_name, host_mount_path, container_mount_path, host_download_path, container_download_path]): - logger.error(f"[Worker {worker_id}] Direct docker download mode requires all docker_* keys in 'direct_docker_cli_policy'. Worker exiting.") - return [] - - try: - os.makedirs(host_mount_path, exist_ok=True) - os.makedirs(host_download_path, exist_ok=True) - except OSError as e: - logger.error(f"[Worker {worker_id}] Could not create required host directories: {e}. Worker exiting.") - return [] - - no_task_streak = 0 - last_used_profile_name = None - task_counter = 0 - while not state_manager.shutdown_event.is_set(): - locked_profile = None - claimed_task_path_host = None - temp_config_dir_host = None - was_banned_by_parser = False - task = None - task_id = None - try: - if no_task_streak > 0 and not queue_policy: # Polling only makes sense for file mode - polling_interval = exec_control.get('worker_polling_interval_seconds', 1) - # --- Add diagnostic logging --- - all_profiles_in_pool = profile_manager_instance.list_profiles() - profiles_in_prefix = [p for p in all_profiles_in_pool if p['name'].startswith(profile_prefix or '')] - if profiles_in_prefix: - state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) - states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) - logger.info(f"[Worker {worker_id}] No tasks found or profiles available. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s. (Streak: {no_task_streak})") - else: - logger.info(f"[Worker {worker_id}] No tasks found or profiles available. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s. (Streak: {no_task_streak})") - # --- End diagnostic logging --- - time.sleep(polling_interval) - if state_manager.shutdown_event.is_set(): continue - - # 1. Get a task - if not queue_policy: - # File-based mode: Find a task and lock its associated profile - locked_profile, claimed_task_path_host = find_task_and_lock_profile( - profile_manager_instance, owner_id, profile_prefix, policy, worker_id - ) - else: - # Queue-based mode - task = state_manager.get_download_task() - if task: - task_id = task.get('id') or task.get('task_id') - if not task_id: - task_id = f"dl_task_{worker_id}_{task_counter}" - task_counter += 1 - task['task_id'] = task_id - - info_json_path_str = task.get('info_json_path') - if not info_json_path_str or not os.path.exists(info_json_path_str): - logger.error(f"[Worker {worker_id}] Task {task_id} has invalid info_json_path: {info_json_path_str}. 
Skipping.") - state_manager.report_download_skipped(task_id, {"error": "Invalid info_json_path", "task": task}) - - auth_profile_name = task.get('auth_profile_name') - auth_env = task.get('auth_env') - if auth_profile_name and auth_env: - auth_manager = get_auth_manager(profile_manager_instance, auth_env) - if auth_manager: - auth_manager.decrement_pending_downloads(auth_profile_name) - continue - - claimed_task_path_host = Path(info_json_path_str) - - # Now lock a profile - specific_profile = task.get('auth_profile_name') or task.get('profile_name') - if specific_profile: - locked_profile = profile_manager_instance.lock_profile(owner=owner_id, specific_profile_name=specific_profile) - if not locked_profile: - logger.warning(f"[Worker {worker_id}] Could not lock specific profile '{specific_profile}'. Trying any profile with prefix.") - locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) - else: - locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) - - if not locked_profile: - logger.warning(f"[Worker {worker_id}] No profiles available for task {task_id}. Re-queueing.") - state_manager.add_download_tasks_batch([task]) - claimed_task_path_host = None - task = None - - if task: - state_manager.mark_download_in_progress(task_id, owner_id) - - if not locked_profile: - if not queue_policy: - no_task_streak += 1 - else: - # In queue mode, if we didn't get a task or a profile, we just poll. - polling_interval = exec_control.get('worker_polling_interval_seconds', 1) - logger.debug(f"[Worker {worker_id}] No download tasks or profiles available. Sleeping for {polling_interval}s.") - time.sleep(polling_interval) - continue - - profile_name = locked_profile['name'] - # We have a task and a lock. - - # User-Agent is not used for download simulation. - user_agent = None - - if claimed_task_path_host: - no_task_streak = 0 - auth_profile_name, auth_env = None, None - info_data = None - - # --- Read info.json content and metadata first --- - try: - with open(claimed_task_path_host, 'r', encoding='utf-8') as f: - info_data = json.load(f) - # This is critical for decrementing the counter in the finally block - metadata = info_data.get('_ytops_metadata', {}) - auth_profile_name = metadata.get('profile_name') - auth_env = metadata.get('auth_env') - except (IOError, json.JSONDecodeError) as e: - logger.error(f"CRITICAL: Could not read or parse task file '{claimed_task_path_host.name}': {e}. This task will be skipped, but the pending downloads counter CANNOT be decremented.") - continue # Skip to finally block to unlock profile - - if args.dummy or args.dummy_batch: - logger.info(f"========== [Worker {worker_id}] BEGIN DUMMY DOCKER DOWNLOAD PER-FORMAT SIMULATION ==========") - logger.info(f"[Worker {worker_id}] Profile: {profile_name} | Task: {claimed_task_path_host.name}") - - dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {}) - min_seconds = dummy_settings.get('download_min_seconds', 1.0) - max_seconds = dummy_settings.get('download_max_seconds', 3.0) - failure_rate = dummy_settings.get('download_failure_rate', 0.0) - skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0) - - # In dummy mode, prioritize the format from the task file, then from the policy. 
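# Sketch of the format-selection priority used by the dummy download path below:
# the task file's _ytops_download_format wins, then the policy's `formats`
# string, and a single placeholder format is simulated if neither is set.
def formats_to_simulate(info_data, policy_formats):
    selection = info_data.get("_ytops_download_format") or policy_formats
    if not selection:
        return ["dummy_format"]
    return [f.strip() for f in selection.split(",") if f.strip()]

assert formats_to_simulate({}, "140-dashy,299-dashy") == ["140-dashy", "299-dashy"]
assert formats_to_simulate({"_ytops_download_format": "299-dashy"}, "") == ["299-dashy"]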
- format_selection = info_data.get('_ytops_download_format') - source_of_format = "task file" - if not format_selection: - format_selection = d_policy.get('formats', '') - source_of_format = "policy" - - if not format_selection: - logger.warning(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. Simulating a single download.") - formats_to_test = ['dummy_format'] - else: - formats_to_test = [f.strip() for f in format_selection.split(',') if f.strip()] - logger.info(f"[Worker {worker_id}] DUMMY: Simulating downloads for formats (from {source_of_format}): {', '.join(formats_to_test)}") - - for format_id in formats_to_test: - if state_manager.shutdown_event.is_set(): - logger.info(f"[Worker {worker_id}] DUMMY: Shutdown requested, stopping format simulation.") - break - - logger.info(f"[Worker {worker_id}] DUMMY: Simulating download for format '{format_id}'...") - time.sleep(random.uniform(min_seconds, max_seconds)) - - rand_val = random.random() - should_fail_skipped = rand_val < skipped_rate - should_fail_fatal = not should_fail_skipped and rand_val < (skipped_rate + failure_rate) - - success = False - details = "" - error_type = None - is_tolerated_error = False - - if should_fail_skipped: - logger.warning(f"[Worker {worker_id}] DUMMY: Simulating skipped download failure for format '{format_id}'.") - details = f"Dummy skipped failure for format {format_id}" - error_type = "DummySkippedFailure" - is_tolerated_error = True - profile_manager_instance.record_activity(profile_name, 'tolerated_error') - elif should_fail_fatal: - logger.warning(f"[Worker {worker_id}] DUMMY: Simulating fatal download failure for format '{format_id}'.") - details = f"Dummy fatal failure for format {format_id}" - error_type = "DummyFailure" - profile_manager_instance.record_activity(profile_name, 'download_error') - else: - logger.info(f"[Worker {worker_id}] DUMMY: Simulating download success for format '{format_id}'.") - success = True - details = f"Dummy success for format {format_id}" - profile_manager_instance.record_activity(profile_name, 'download') - - event = { - 'type': 'direct_docker_download', - 'profile': profile_name, - 'proxy_url': locked_profile['proxy'], - 'success': success, - 'details': details, - 'error_type': error_type, - 'is_tolerated_error': is_tolerated_error, - 'format': format_id - } - state_manager.log_event(event) - - logger.info(f"========== [Worker {worker_id}] END DUMMY DOCKER DOWNLOAD SIMULATION ==========") - - # In dummy mode, we just rename the file to processed and continue to the finally block. 
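# Sketch of the rename performed below: the claimed task file carries a
# ".LOCKED.<suffix>" marker, which is stripped before ".processed" is appended.
# The exact lock-suffix format shown here is an assumption for illustration.
from pathlib import Path

def processed_path_for(claimed_path):
    base = str(claimed_path).rsplit(".LOCKED.", 1)[0]
    return Path(base + ".processed")

assert processed_path_for(
    Path("abc123-user1_a-proxy.info.json.LOCKED.worker-3")
).name == "abc123-user1_a-proxy.info.json.processed"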
- try: - base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] - processed_path = Path(f"{base_path_str}.processed") - claimed_task_path_host.rename(processed_path) - logger.debug(f"DUMMY MODE: Renamed processed task file to '{processed_path.name}'.") - except (OSError, IndexError) as e: - logger.error(f"DUMMY MODE: Failed to rename processed task file '{claimed_task_path_host}': {e}") - - continue # Skip to finally block - - # --- Check for URL expiration before running Docker --- - if d_policy.get('check_url_expiration', True): - # Heuristic: check the first available format URL - first_format = next((f for f in info_data.get('formats', []) if 'url' in f), None) - if first_format: - url_to_check = first_format['url'] - time_shift_minutes = d_policy.get('expire_time_shift_minutes', 0) - status, time_left_seconds = sp_utils.check_url_expiry(url_to_check, time_shift_minutes) - - logger.debug(f"[Worker {worker_id}] [{profile_name}] URL expiration check for task '{claimed_task_path_host.name}': status={status}, time_left={time_left_seconds:.0f}s") - - if status == 'expired': - details = "Download URL is expired" - if time_shift_minutes > 0 and time_left_seconds > 0: - logger.warning(f"[Worker {worker_id}] [{profile_name}] Skipping task '{claimed_task_path_host.name}' because its URL will expire in {time_left_seconds/60:.1f}m (within {time_shift_minutes}m time-shift).") - details = f"URL will expire within {time_shift_minutes}m time-shift" - else: - logger.warning(f"[Worker {worker_id}] [{profile_name}] Skipping task '{claimed_task_path_host.name}' because its URL is expired.") - - profile_manager_instance.record_activity(profile_name, 'tolerated_error') - - event = { - 'type': 'direct_docker_download', 'profile': profile_name, - 'proxy_url': locked_profile['proxy'], 'success': False, - 'error_type': 'Skipped (Expired URL)', 'details': details, - 'is_tolerated_error': True - } - state_manager.log_event(event) - - try: - base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] - processed_path = Path(f"{base_path_str}.processed") - claimed_task_path_host.rename(processed_path) - logger.debug(f"Renamed expired task file to '{processed_path.name}'.") - except (OSError, IndexError) as e: - logger.error(f"Failed to rename expired task file '{claimed_task_path_host}': {e}") - - continue # Skip to the finally block - - # The path to the task file inside the container needs to be relative to the host mount root. - # We must make the task path absolute first to correctly calculate the relative path from the absolute mount path. - relative_task_path = os.path.relpath(os.path.abspath(claimed_task_path_host), host_mount_path) - task_path_container = os.path.join(container_mount_path, relative_task_path) - - # 3. 
Prepare config file on host in a temporary directory - temp_config_dir_host = tempfile.mkdtemp(prefix=f"docker-dl-config-{worker_id}-", dir=host_mount_path) - config_dir_name = os.path.basename(temp_config_dir_host) - config_dir_container = os.path.join(container_mount_path, config_dir_name) - - environment = {} - - base_config_content = "" - base_config_file = direct_policy.get('ytdlp_config_file') - if base_config_file: - config_path_to_read = Path(base_config_file) - if not config_path_to_read.exists(): - config_path_to_read = Path(sp_utils._PROJECT_ROOT) / base_config_file - if config_path_to_read.exists(): - try: - with open(config_path_to_read, 'r', encoding='utf-8') as base_f: - base_config_content = base_f.read() - except IOError as e: - logger.error(f"[Worker {worker_id}] Could not read ytdlp_config_file '{config_path_to_read}': {e}") - - config_overrides = direct_policy.get('ytdlp_config_overrides', {}).copy() - config_overrides['proxy'] = locked_profile['proxy'] - config_overrides['load-info-json'] = task_path_container - config_overrides['output'] = os.path.join(container_download_path, '%(id)s.f%(format_id)s.%(ext)s') - - # Prevent yt-dlp from using a cache directory. - config_overrides['no-cache-dir'] = True - - overrides_content = sp_utils._config_dict_to_flags_file_content(config_overrides) - raw_args_from_policy = direct_policy.get('ytdlp_raw_args', []) - raw_args_content = '\n'.join(raw_args_from_policy) - - config_content = f"{base_config_content.strip()}\n\n# --- Overrides from policy ---\n{overrides_content}" - if raw_args_content: - config_content += f"\n\n# --- Raw args from policy ---\n{raw_args_content}" - - logger.info(f"[Worker {worker_id}] [{profile_name}] Generated yt-dlp config:\n---config---\n{config_content}\n------------") - - ytdlp_config_dir_host = os.path.join(temp_config_dir_host, 'yt-dlp') - os.makedirs(ytdlp_config_dir_host, exist_ok=True) - temp_config_file_host = os.path.join(ytdlp_config_dir_host, 'config') - with open(temp_config_file_host, 'w', encoding='utf-8') as f: - f.write(config_content) - - # 4. Construct and run docker run command - volumes = { - os.path.abspath(host_mount_path): {'bind': container_mount_path, 'mode': 'ro'}, - os.path.abspath(host_download_path): {'bind': container_download_path, 'mode': 'rw'} - } - # The command tells yt-dlp exactly where to find the config file we created. 
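# Sketch of the host/container path re-rooting this mode relies on: paths such
# as the task file and the --config-locations target are built by making the
# host path relative to the host mount root and re-rooting it under the
# container mount path. The example paths are hypothetical.
import os

def host_to_container(host_path, host_mount, container_mount):
    rel = os.path.relpath(os.path.abspath(host_path), os.path.abspath(host_mount))
    return os.path.join(container_mount, rel)

assert host_to_container(
    "/srv/docker_mount/tasks/x.info.json", "/srv/docker_mount", "/data"
) == "/data/tasks/x.info.json"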
- command = ['yt-dlp', '--config-locations', os.path.join(config_dir_container, 'yt-dlp/config')] - logger.info(f"[Worker {worker_id}] [{profile_name}] Running docker command: {' '.join(shlex.quote(s) for s in command)}") - - # For logging purposes, construct the full equivalent command line with host paths - log_config_overrides_for_host = config_overrides.copy() - log_config_overrides_for_host['load-info-json'] = str(claimed_task_path_host) - log_config_overrides_for_host['output'] = os.path.join(host_download_path, '%(id)s.f%(format_id)s.%(ext)s') - - log_command_override = ['yt-dlp'] - if base_config_content: - log_command_override.extend(sp_utils._parse_config_file_to_cli_args(base_config_content)) - log_command_override.extend(sp_utils._config_dict_to_cli_flags(log_config_overrides_for_host)) - raw_args_from_policy = direct_policy.get('ytdlp_raw_args', []) - for raw_arg in raw_args_from_policy: - log_command_override.extend(shlex.split(raw_arg)) - - # --- Live log parsing and activity recording --- - live_success_count = 0 - live_failure_count = 0 - live_tolerated_count = 0 - activity_lock = threading.Lock() - - tolerated_error_patterns = direct_policy.get('tolerated_error_patterns', []) - fatal_error_patterns = direct_policy.get('fatal_error_patterns', []) - - def log_parser_callback(line): - nonlocal live_success_count, live_failure_count, live_tolerated_count, was_banned_by_parser - - # Success is a high-priority check. Only record one success per task. - if '[download] 100% of' in line or 'has already been downloaded' in line: - with activity_lock: - # Only count one success per task - if live_success_count == 0: - live_success_count += 1 - logger.info(f"[Worker {worker_id}] [{profile_name}] Live download success detected from log.") - profile_manager_instance.record_activity(profile_name, 'download') - return False - - # Check for fatal patterns - for pattern in fatal_error_patterns: - if re.search(pattern, line, re.IGNORECASE): - with activity_lock: - live_failure_count += 1 - logger.error(f"[Worker {worker_id}] [{profile_name}] Live FATAL download error #{live_failure_count} detected from log: {line}") - profile_manager_instance.record_activity(profile_name, 'download_error') - if direct_policy.get('ban_on_fatal_error_in_batch'): - logger.warning(f"Banning profile '{profile_name}' immediately due to fatal download error to stop container.") - profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during download') - was_banned_by_parser = True - return True # Signal to stop container - return False # Do not stop if ban_on_fatal_error_in_batch is false - - # Only process lines that contain ERROR: for tolerated/generic failures - if 'ERROR:' not in line: - return False - - # Check if it's a tolerated error - for pattern in tolerated_error_patterns: - if re.search(pattern, line, re.IGNORECASE): - with activity_lock: - live_tolerated_count += 1 - logger.warning(f"[Worker {worker_id}] [{profile_name}] Live TOLERATED download error #{live_tolerated_count} detected from log: {line}") - profile_manager_instance.record_activity(profile_name, 'tolerated_error') - return False - - # If it's an ERROR: line and not tolerated, it's a failure - with activity_lock: - live_failure_count += 1 - logger.warning(f"[Worker {worker_id}] [{profile_name}] Live download failure #{live_failure_count} detected from log: {line}") - profile_manager_instance.record_activity(profile_name, 'download_error') - - return False - - retcode, stdout, stderr, stop_reason = 
run_docker_container( - image_name=image_name, - command=command, - volumes=volumes, - stream_prefix=f"[Worker {worker_id} | docker-ytdlp] ", - network_name=network_name, - log_callback=log_parser_callback, - profile_manager=profile_manager_instance, - profile_name=profile_name, - environment=environment, - log_command_override=log_command_override - ) - - # 5. Post-process and record activity - full_output = f"{stdout}\n{stderr}" - is_bot_error = "Sign in to confirm you're not a bot" in full_output - if is_bot_error: - logger.warning(f"[Worker {worker_id}] [{profile_name}] Bot detection occurred during download. Marking as failure.") - - # --- Final Outcome Determination --- - # Activity is now recorded live by the log parser. This block just determines - # the overall success/failure for logging and event reporting. - success = False - final_outcome = "unknown" - with activity_lock: - if live_success_count > 0: - success = True - final_outcome = "download" - elif live_failure_count > 0 or is_bot_error: - final_outcome = "download_error" - elif live_tolerated_count > 0: - final_outcome = "tolerated_error" - elif retcode == 0: - # Fallback if no logs were matched but exit was clean. - success = True - final_outcome = "download" - logger.warning(f"[Worker {worker_id}] [{profile_name}] No specific success/error log line matched, but exit code is 0. Assuming success, but this may indicate a parsing issue.") - # We record a success here as a fallback, in case the log parser missed it. - profile_manager_instance.record_activity(profile_name, 'download') - else: - final_outcome = "download_error" - logger.warning(f"[Worker {worker_id}] [{profile_name}] No specific error log line matched, but exit code was {retcode}. Recording a generic download_error.") - profile_manager_instance.record_activity(profile_name, 'download_error') - - # --- Airflow Directory Logic --- - if success and d_policy.get('output_to_airflow_ready_dir'): - try: - # Get video_id from the info.json - with open(claimed_task_path_host, 'r', encoding='utf-8') as f: - info_data = json.load(f) - video_id = info_data.get('id') - - if not video_id: - logger.error(f"[{profile_name}] Could not find video ID in '{claimed_task_path_host.name}' for moving files.") - else: - # Scan the download directory for all resulting media files - downloaded_files = [f for f in os.listdir(host_download_path) if not f.endswith(('.part', '.ytdl'))] - logger.info(f"[{profile_name}] Found {len(downloaded_files)} downloaded file(s) to process in '{host_download_path}'.") - - if not downloaded_files: - logger.warning(f"[{profile_name}] Download reported success, but no media files were found in '{host_download_path}'.") - - # --- Prepare the single destination directory for all artifacts --- - now = datetime.now() - rounded_minute = (now.minute // 10) * 10 - timestamp_str = now.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}" - - base_path = d_policy.get('airflow_ready_dir_base_path', 'downloadfiles/videos/ready') - if not os.path.isabs(base_path): - base_path = os.path.join(sp_utils._PROJECT_ROOT, base_path) - final_dir_base = os.path.join(base_path, timestamp_str) - final_dir_path = os.path.join(final_dir_base, video_id) - os.makedirs(final_dir_path, exist_ok=True) - - # --- Copy info.json once --- - new_info_json_name = f"info_{video_id}.json" - dest_info_json_path = os.path.join(final_dir_path, new_info_json_name) - if not os.path.exists(dest_info_json_path): - shutil.copy(claimed_task_path_host, dest_info_json_path) - logger.info(f"[{profile_name}] 
Copied info.json to {dest_info_json_path}") - - # --- Process each downloaded media file --- - for downloaded_filename in downloaded_files: - downloaded_file_host_path = os.path.join(host_download_path, downloaded_filename) - if not os.path.exists(downloaded_file_host_path): - logger.warning(f"[{profile_name}] File '{downloaded_filename}' disappeared before it could be processed.") - continue - - media_path = Path(downloaded_file_host_path) - - # 1. Run ffprobe - if d_policy.get('run_ffprobe'): - ffprobe_filename = f"ffprobe_{media_path.stem}.json" - # Create ffprobe json in the final destination to avoid moving it - ffprobe_output_path = Path(final_dir_path) / ffprobe_filename - _run_ffprobe(media_path, ffprobe_output_path) - - # 2. Move media file - final_media_path = Path(final_dir_path) / media_path.name - shutil.move(str(media_path), str(final_media_path)) - logger.info(f"[{profile_name}] Moved media file '{media_path.name}' to {final_media_path}") - media_path = final_media_path # Update media_path to its new location - - # 3. Cleanup the media file at its final location - if d_policy.get('cleanup'): - if os.path.exists(media_path): - _cleanup_media_file(media_path) - - except Exception as e: - logger.error(f"[{profile_name}] Failed during post-download processing for Airflow: {e}", exc_info=True) - - event_details = f"Docker download finished. Exit: {retcode}. Final Outcome: {final_outcome}. (Live successes: {live_success_count}, Live failures: {live_failure_count}, Live tolerated: {live_tolerated_count})" - if not success and stderr: - event_details += f" Stderr: {stderr.strip().splitlines()[-1] if stderr.strip() else 'N/A'}" - if stop_reason: - event_details += f" Aborted: {stop_reason}." - - event = { 'type': 'direct_docker_download', 'profile': profile_name, 'proxy_url': locked_profile['proxy'], 'success': success, 'details': event_details } - state_manager.log_event(event) - - logger.info(f"[Worker {worker_id}] [{profile_name}] Task processing complete. Worker will now unlock profile and attempt next task.") - - # 6. Clean up task file - if not queue_policy: - # File-based mode: rename to .processed - try: - # The claimed_task_path_host has a .LOCKED suffix, remove it before adding .processed - base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] - processed_path = Path(f"{base_path_str}.processed") - claimed_task_path_host.rename(processed_path) - logger.debug(f"[{sp_utils.get_display_name(claimed_task_path_host)}] Renamed processed task file to '{processed_path.name}'.") - except (OSError, IndexError) as e: - logger.error(f"Failed to rename processed task file '{claimed_task_path_host}': {e}") - elif d_policy.get('rename_source_info_json_on_success'): - # Queue-based mode: respect rename policy - source_path_to_rename = task.get('info_json_path') - if success and source_path_to_rename and os.path.exists(source_path_to_rename): - try: - processed_path = source_path_to_rename + ".processed" - shutil.move(source_path_to_rename, processed_path) - logger.info(f"[Worker {worker_id}] Renamed source info.json to '{processed_path}'") - except Exception as e: - logger.warning(f"[Worker {worker_id}] Could not rename source info.json '{source_path_to_rename}': {e}") - # After this point, claimed_task_path_host is no longer valid. - # The metadata has already been read into auth_profile_name and auth_env. - else: - # This case should not be reached with the new task-first locking logic. 
- logger.warning(f"[Worker {worker_id}] Inconsistent state: locked profile '{profile_name}' but no task was claimed. Unlocking and continuing.") - - except Exception as e: - logger.error(f"[Worker {worker_id}] An unexpected error occurred in the worker loop: {e}", exc_info=True) - if locked_profile: - profile_manager_instance.record_activity(locked_profile['name'], 'failure') # Generic failure - time.sleep(5) - finally: - if locked_profile: - if claimed_task_path_host: - # The auth_profile_name and auth_env variables were populated in the `try` block - # before the task file was renamed or deleted. - if auth_profile_name and auth_env: - auth_manager = get_auth_manager(profile_manager_instance, auth_env) - if auth_manager: - auth_manager.decrement_pending_downloads(auth_profile_name) - else: - logger.error(f"Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.") - else: - logger.warning(f"Could not find auth profile name and/or auth_env in info.json metadata. Pending downloads counter will not be decremented. (Profile: {auth_profile_name}, Env: {auth_env})") - - if was_banned_by_parser: - logger.info(f"[Worker {worker_id}] Profile '{locked_profile['name']}' was already banned by the log parser. Skipping unlock/cooldown.") - else: - last_used_profile_name = locked_profile['name'] - cooldown = None - # Only apply cooldown if a task was actually claimed and processed. - if claimed_task_path_host: - # Enforcer is the only point where we configure to apply different policies, - # since we might restart enforcer, but won't restart stress-policy working on auth and downloads simultaneously. - # This is like applying a policy across multiple workers/machines without needing to restart each of them. - # DESIGN: The cooldown duration is not configured in the worker's policy. - # Instead, it is read from a central Redis key. This key is set by the - # policy-enforcer, making the enforcer the single source of truth for - # this policy. This allows changing the cooldown behavior without - # restarting the workers. - cooldown_source_value = profile_manager_instance.get_config('unlock_cooldown_seconds') - source_description = "Redis config" - - if cooldown_source_value is None: - cooldown_source_value = d_policy.get('default_unlock_cooldown_seconds') - source_description = "local policy" - - if cooldown_source_value is not None: - try: - # If from Redis, it's a string that needs parsing. - # If from local policy, it's already an int or list. 
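# Sketch of the cooldown resolution described in the comments above: the value
# from Redis arrives as a string (a JSON "[min, max]" list, a JSON int, or bare
# digits), while the local-policy fallback is already an int or a two-element
# list. The helper name is hypothetical.
import json
import random
from typing import Optional, Union

def resolve_cooldown(value: Union[str, int, list, None]) -> Optional[int]:
    if value is None:
        return None
    if isinstance(value, str):
        try:
            value = json.loads(value)
        except (json.JSONDecodeError, TypeError):
            return int(value) if value.isdigit() else None
    if isinstance(value, list) and len(value) == 2 and value[0] < value[1]:
        return random.randint(value[0], value[1])
    if isinstance(value, int) and not isinstance(value, bool):
        return value
    return None

assert resolve_cooldown("30") == 30
assert 10 <= resolve_cooldown("[10, 20]") <= 20
assert resolve_cooldown([10, 20]) in range(10, 21)
assert resolve_cooldown(None) is None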
- val = cooldown_source_value - if isinstance(val, str): - val = json.loads(val) - - if isinstance(val, list) and len(val) == 2 and val[0] < val[1]: - cooldown = random.randint(val[0], val[1]) - elif isinstance(val, int): - cooldown = val - - if cooldown is not None: - logger.debug(f"Determined cooldown from {source_description}: {cooldown_source_value}") - - except (json.JSONDecodeError, TypeError): - if isinstance(cooldown_source_value, str) and cooldown_source_value.isdigit(): - cooldown = int(cooldown_source_value) - logger.debug(f"Determined cooldown from {source_description}: {cooldown_source_value}") - - if cooldown: - logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") - - profile_manager_instance.unlock_profile( - locked_profile['name'], - owner=owner_id, - rest_for_seconds=cooldown - ) - if not queue_policy and claimed_task_path_host and os.path.exists(claimed_task_path_host): - # Clean up .LOCKED file in file-based mode - try: os.remove(claimed_task_path_host) - except OSError: pass - - if task and task_id: - state_manager.remove_download_in_progress(task_id) - - if temp_config_dir_host and os.path.exists(temp_config_dir_host): - try: - shutil.rmtree(temp_config_dir_host) - except OSError: pass - - logger.info(f"[Worker {worker_id}] Worker loop finished.") - return [] - except Exception as e: - logger.error(f"[Worker {worker_id}] A fatal, unhandled error occurred in the worker thread: {e}", exc_info=True) - return [] - - -def run_direct_download_worker(worker_id, policy, state_manager, args, profile_manager_instance, running_processes, process_lock, profile_prefix=None): - """A persistent worker for the 'direct_download_cli' orchestration mode.""" - owner_id = f"direct-dl-worker-{worker_id}" - settings = policy.get('settings', {}) - exec_control = policy.get('execution_control', {}) - d_policy = policy.get('download_policy', {}) - direct_policy = policy.get('direct_download_cli_policy', {}) - - # Prioritize the passed-in profile_prefix for worker pool compatibility. - if not profile_prefix: - profile_prefix = d_policy.get('profile_prefix') - - # Unlike other modes, this worker can function without a prefix (it will try to lock any active profile). - # The check `if not profile_prefix` is removed to allow this flexibility. - - output_dir = direct_policy.get('output_dir') - if not output_dir: - logger.error(f"[Worker {worker_id}] Direct download mode requires 'direct_download_cli_policy.output_dir'. Worker exiting.") - return [] - - os.makedirs(output_dir, exist_ok=True) - no_task_streak = 0 - - while not state_manager.shutdown_event.is_set(): - locked_profile = None - claimed_task_path = None - try: - # 0. If no tasks were found, pause briefly. - if no_task_streak > 0: - polling_interval = exec_control.get('worker_polling_interval_seconds', 1) - # --- Add diagnostic logging --- - all_profiles_in_pool = profile_manager_instance.list_profiles() - profiles_in_prefix = [p for p in all_profiles_in_pool if p['name'].startswith(profile_prefix or '')] - if profiles_in_prefix: - state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) - states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) - logger.info(f"[Worker {worker_id}] No tasks found for available profiles. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s. (Streak: {no_task_streak})") - else: - logger.info(f"[Worker {worker_id}] No tasks found for available profiles. 
No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s. (Streak: {no_task_streak})") - # --- End diagnostic logging --- - time.sleep(polling_interval) - if state_manager.shutdown_event.is_set(): continue - - # 1. Find a task and lock its associated profile - locked_profile, claimed_task_path = find_task_and_lock_profile( - profile_manager_instance, owner_id, profile_prefix, policy, worker_id - ) - - if not locked_profile: - no_task_streak += 1 - # The main loop will pause if the streak continues. - continue - - profile_name = locked_profile['name'] - # We have a task and a lock. - - if claimed_task_path: - no_task_streak = 0 # Reset streak - auth_profile_name, auth_env = None, None - - # --- Read metadata before processing/deleting file --- - try: - with open(claimed_task_path, 'r', encoding='utf-8') as f: - info_data = json.load(f) - metadata = info_data.get('_ytops_metadata', {}) - auth_profile_name = metadata.get('profile_name') - auth_env = metadata.get('auth_env') - except (IOError, json.JSONDecodeError) as e: - logger.error(f"Could not read info.json to get auth profile for decrementing counter: {e}") - - # 3. Construct and run the command - ytdlp_cmd_str = direct_policy.get('ytdlp_command') - if not ytdlp_cmd_str: - logger.error(f"[Worker {worker_id}] Direct download mode requires 'direct_download_cli_policy.ytdlp_command'.") - break - - proxy_url = locked_profile['proxy'] - proxy_rename = direct_policy.get('proxy_rename') - if proxy_rename: - rename_rule = proxy_rename.strip("'\"") - if rename_rule.startswith('s/') and rename_rule.count('/') >= 2: - try: - parts = rename_rule.split('/') - proxy_url = re.sub(parts[1], parts[2], proxy_url) - except (re.error, IndexError): - logger.error(f"[Worker {worker_id}] Invalid proxy_rename rule: {proxy_rename}") - - output_template = os.path.join(output_dir, '%(title)s - %(id)s.%(ext)s') - - cmd = shlex.split(ytdlp_cmd_str) - cmd.extend(['--load-info-json', str(claimed_task_path)]) - cmd.extend(['--proxy', proxy_url]) - cmd.extend(['-o', output_template]) - - ytdlp_args = direct_policy.get('ytdlp_args') - if ytdlp_args: - cmd.extend(shlex.split(ytdlp_args)) - - if args.verbose and '--verbose' not in cmd: - cmd.append('--verbose') - - custom_env = direct_policy.get('env_vars', {}).copy() - - # --- PYTHONPATH for custom yt-dlp module --- - ytdlp_module_path = direct_policy.get('ytdlp_module_path') - if ytdlp_module_path: - existing_pythonpath = custom_env.get('PYTHONPATH', os.environ.get('PYTHONPATH', '')) - custom_env['PYTHONPATH'] = f"{ytdlp_module_path}{os.pathsep}{existing_pythonpath}".strip(os.pathsep) - logger.debug(f"[Worker {worker_id}] Using custom PYTHONPATH: {custom_env['PYTHONPATH']}") - - # Pass profile info to the custom yt-dlp process - custom_env['YTDLP_PROFILE_NAME'] = profile_name - custom_env['YTDLP_PROXY_URL'] = locked_profile['proxy'] # Original proxy - env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') - custom_env['YTDLP_SIM_MODE'] = env_name - - # Create a per-profile cache directory and set XDG_CACHE_HOME - cache_dir_base = direct_policy.get('cache_dir_base', '.cache') - profile_cache_dir = os.path.join(cache_dir_base, profile_name) - try: - os.makedirs(profile_cache_dir, exist_ok=True) - custom_env['XDG_CACHE_HOME'] = profile_cache_dir - except OSError as e: - logger.error(f"[Worker {worker_id}] Failed to create cache directory '{profile_cache_dir}': {e}") - - logger.info(f"[Worker {worker_id}] [{profile_name}] Processing task '{claimed_task_path.name}'...") - 
if args.dummy or args.dummy_batch: - logger.info(f"========== [Worker {worker_id}] BEGIN DUMMY DIRECT DOWNLOAD ==========") - logger.info(f"[Worker {worker_id}] Profile: {profile_name} | Task: {claimed_task_path.name}") - logger.info(f"[Worker {worker_id}] Would run command: {' '.join(shlex.quote(s) for s in cmd)}") - logger.info(f"[Worker {worker_id}] With environment: {custom_env}") - - dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {}) - min_seconds = dummy_settings.get('download_min_seconds', 0.5) - max_seconds = dummy_settings.get('download_max_seconds', 1.5) - failure_rate = dummy_settings.get('download_failure_rate', 0.0) - skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0) - - time.sleep(random.uniform(min_seconds, max_seconds)) - - rand_val = random.random() - should_fail_skipped = rand_val < skipped_rate - should_fail_fatal = not should_fail_skipped and rand_val < (skipped_rate + failure_rate) - - if should_fail_skipped: - logger.warning(f"[Worker {worker_id}] DUMMY: Simulating skipped download failure.") - profile_manager_instance.record_activity(profile_name, 'tolerated_error') - retcode = 0 - stderr = "Dummy skipped failure" - elif should_fail_fatal: - logger.warning(f"[Worker {worker_id}] DUMMY: Simulating fatal download failure.") - profile_manager_instance.record_activity(profile_name, 'download_error') - retcode = 1 - stderr = "Dummy fatal failure" - else: - logger.info(f"[Worker {worker_id}] DUMMY: Simulating download success.") - profile_manager_instance.record_activity(profile_name, 'download') - retcode = 0 - stderr = "" - logger.info(f"========== [Worker {worker_id}] END DUMMY DIRECT DOWNLOAD ==========") - else: - logger.info(f"[Worker {worker_id}] [{profile_name}] Running command: {' '.join(shlex.quote(s) for s in cmd)}") - logger.info(f"[Worker {worker_id}] [{profile_name}] With environment: {custom_env}") - retcode, stdout, stderr = run_command( - cmd, running_processes, process_lock, env=custom_env, stream_output=args.verbose, - stream_prefix=f"[Worker {worker_id} | yt-dlp] " - ) - - # 4. Record activity - if not (args.dummy or args.dummy_batch): - success = (retcode == 0) - activity_type = 'download' if success else 'download_error' - logger.info(f"[Worker {worker_id}] Recording '{activity_type}' for profile '{profile_name}'.") - profile_manager_instance.record_activity(profile_name, activity_type) - - event_details = f"Download finished. Exit code: {retcode}." - if retcode != 0 and stderr: - event_details += f" Stderr: {stderr.strip().splitlines()[-1]}" - - event = {'type': 'direct_download', 'profile': profile_name, 'proxy_url': proxy_url, 'success': (retcode == 0), 'details': event_details} - state_manager.log_event(event) - - # 5. 
Clean up the processed task file - try: - os.remove(claimed_task_path) - logger.debug(f"[{sp_utils.get_display_name(claimed_task_path)}] Removed processed task file.") - except OSError as e: - logger.error(f"Failed to remove processed task file '{claimed_task_path}': {e}") - else: - no_task_streak += 1 - logger.info(f"[Worker {worker_id}] No tasks found for profile '{profile_name}'.") - - except Exception as e: - logger.error(f"[Worker {worker_id}] An unexpected error occurred in the worker loop: {e}", exc_info=True) - if locked_profile: - profile_manager_instance.record_activity(locked_profile['name'], 'failure') # Generic failure - time.sleep(5) - finally: - if locked_profile: - if claimed_task_path: - # The auth_profile_name and auth_env variables were populated in the `try` block - # before the task file was deleted. - if auth_profile_name and auth_env: - auth_manager = get_auth_manager(profile_manager_instance, auth_env) - if auth_manager: - auth_manager.decrement_pending_downloads(auth_profile_name) - else: - logger.error(f"Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.") - else: - logger.warning(f"Could not find auth profile name and/or auth_env in info.json metadata. Pending downloads counter will not be decremented. (Profile: {auth_profile_name}, Env: {auth_env})") - - cooldown = None - if claimed_task_path: - # Enforcer is the only point where we configure to apply different policies, - # since we might restart enforcer, but won't restart stress-policy working on auth and downloads simultaneously. - # This is like applying a policy across multiple workers/machines without needing to restart each of them. - # DESIGN: The cooldown duration is not configured in the worker's policy. - # Instead, it is read from a central Redis key. This key is set by the - # policy-enforcer, making the enforcer the single source of truth for - # this policy. This allows changing the cooldown behavior without - # restarting the workers. - cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds') - if cooldown_config: - try: - val = json.loads(cooldown_config) - if isinstance(val, list) and len(val) == 2 and val[0] < val[1]: - cooldown = random.randint(val[0], val[1]) - elif isinstance(val, int): - cooldown = val - except (json.JSONDecodeError, TypeError): - if cooldown_config.isdigit(): - cooldown = int(cooldown_config) - - if cooldown: - logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") - - profile_manager_instance.unlock_profile( - locked_profile['name'], - owner=owner_id, - rest_for_seconds=cooldown - ) - locked_profile = None - - logger.info(f"[Worker {worker_id}] Worker loop finished.") - return [] +# This file is deprecated and has been refactored. +# Worker functions are now located in their own dedicated modules: +# - direct_batch_worker.py +# - direct_docker_worker.py +# - direct_download_worker.py +# - throughput_worker.py +# - queue_workers.py +# +# Generic helper functions have been moved to worker_utils.py. +# +# This file is kept to avoid breaking old imports but should not be used. 
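The cooldown handling in the removed worker code (which the refactored worker modules are expected to preserve) reads 'unlock_cooldown_seconds' from the enforcer-owned Redis config first and falls back to the policy's 'default_unlock_cooldown_seconds' only when the Redis key is unset. The value may be a fixed integer, a JSON '[min, max]' range, or a bare digit string. Below is a minimal sketch of that resolution logic under those assumptions; the helper name 'resolve_cooldown_seconds' is illustrative and does not exist in the refactored modules.

import json
import random

def resolve_cooldown_seconds(redis_value, policy_default=None):
    """Sketch of the cooldown resolution used by the download workers.

    'redis_value' is the raw value stored under the enforcer-set
    'unlock_cooldown_seconds' config key (a string when read from Redis);
    'policy_default' is the local 'default_unlock_cooldown_seconds' fallback.
    Returns a cooldown in seconds, or None if no cooldown is configured.
    """
    source = redis_value if redis_value is not None else policy_default
    if source is None:
        return None
    try:
        # Values read from Redis are strings and need parsing; local policy
        # values are already an int or a [min, max] list.
        val = json.loads(source) if isinstance(source, str) else source
        if isinstance(val, list) and len(val) == 2 and val[0] < val[1]:
            return random.randint(val[0], val[1])  # e.g. "[30, 90]" -> 30..90s
        if isinstance(val, int):
            return val                             # e.g. "45" or 45 -> fixed 45s
    except (json.JSONDecodeError, TypeError):
        if isinstance(source, str) and source.isdigit():
            return int(source)                     # tolerate a bare digit string
    return None

Keeping the parsing tolerant of both shapes is what lets the enforcer switch between a fixed cooldown and a randomized range without restarting the workers, which is the point of the DESIGN note in the deleted code above.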
diff --git a/ytops_client-source/ytops_client/stress_policy_tool.py b/ytops_client-source/ytops_client/stress_policy_tool.py index 25ee4e4..afed41c 100644 --- a/ytops_client-source/ytops_client/stress_policy_tool.py +++ b/ytops_client-source/ytops_client/stress_policy_tool.py @@ -104,12 +104,14 @@ from .profile_manager_tool import ProfileManager from .stress_policy.state_manager import StateManager from .stress_policy.process_runners import run_command, run_docker_container, get_worker_id from .stress_policy import utils as sp_utils -from .stress_policy.workers import ( - _run_download_logic, process_profile_task, run_download_worker, process_info_json_cycle, - run_throughput_worker, _post_process_and_move_info_json, run_direct_batch_worker, - run_direct_docker_worker, find_task_and_lock_profile, run_direct_docker_download_worker, - run_direct_download_worker -) +# This block replaces the obsolete import from the deprecated `workers.py` file. +# Worker functions are now imported from their own dedicated modules as per the refactoring. +from .stress_policy.direct_batch_worker import run_direct_batch_worker +from .stress_policy.direct_docker_worker import run_direct_docker_worker, run_direct_docker_download_worker +from .stress_policy.direct_download_worker import run_direct_download_worker +from .stress_policy.throughput_worker import run_throughput_worker +# Helper functions for the legacy "task-first" mode are now in worker_utils.py +from .stress_policy.worker_utils import _run_download_logic, process_profile_task from .stress_policy.queue_workers import ( run_queue_auth_worker, run_queue_download_worker ) @@ -192,18 +194,8 @@ def main_stress_policy(args): if args.profile_prefix: # This shortcut overrides the profile_prefix for all relevant stages. # Useful for simple fetch_only or download_only runs. - - # Ensure info_json_generation_policy is a dict before assigning to it. - # This handles cases where the policy has a non-dict value (like None or a string). - if not isinstance(policy.get('info_json_generation_policy'), dict): - policy['info_json_generation_policy'] = {} - policy['info_json_generation_policy']['profile_prefix'] = args.profile_prefix - - # Ensure download_policy is a dict before assigning to it. - if not isinstance(policy.get('download_policy'), dict): - policy['download_policy'] = {} - policy['download_policy']['profile_prefix'] = args.profile_prefix - + policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = args.profile_prefix + policy.setdefault('download_policy', {})['profile_prefix'] = args.profile_prefix # Use print because logger is not yet configured. 
print(f"Overriding profile_prefix for all stages with CLI arg: {args.profile_prefix}", file=sys.stderr) @@ -413,6 +405,7 @@ def main_stress_policy(args): redis_host = args.redis_host or os.getenv('REDIS_HOST', os.getenv('MASTER_HOST_IP', 'localhost')) redis_port = args.redis_port if args.redis_port is not None else int(os.getenv('REDIS_PORT', 6379)) redis_password = args.redis_password or os.getenv('REDIS_PASSWORD') + redis_db = args.redis_db if args.redis_db is not None else int(os.getenv('REDIS_DB', 0)) sim_params = policy.get('simulation_parameters', {}) @@ -436,7 +429,8 @@ def main_stress_policy(args): return ProfileManager( redis_host=redis_host, redis_port=redis_port, - redis_password=redis_password, key_prefix=key_prefix + redis_password=redis_password, key_prefix=key_prefix, + redis_db=redis_db ) # Determine which managers are needed based on mode and orchestration mode @@ -470,80 +464,6 @@ def main_stress_policy(args): if len(profile_managers) == 1: profile_manager = list(profile_managers.values())[0] - # --- Worker Launching Logic --- - # This block determines how many workers to launch and which function to run. - # It centralizes the logic for handling worker_pools vs. legacy workers setting. - - # Check if the user explicitly set execution_control.workers via the CLI. - # This gives the CLI override precedence over the worker_pools config in the file. - cli_overrode_workers = any('execution_control.workers' in s for s in args.set) - - worker_pools = exec_control.get('worker_pools') - use_worker_pools = worker_pools and not cli_overrode_workers - - total_workers = 0 - worker_configs = [] # List of {'target': function, 'kwargs': {}} - - # Determine the target worker function based on orchestration mode - target_worker_func = None - manager_for_worker = None - urls_list = [] - - if orchestration_mode == 'throughput': - target_worker_func = run_throughput_worker - manager_for_worker = profile_managers.get('download') - elif orchestration_mode == 'direct_batch_cli': - target_worker_func = run_direct_batch_worker - use_env = policy.get('direct_batch_cli_policy', {}).get('use_profile_env', 'auth') - manager_for_worker = profile_managers.get(use_env) - elif orchestration_mode == 'direct_docker_cli': - if mode == 'fetch_only': - target_worker_func = run_direct_docker_worker - elif mode == 'download_only': - target_worker_func = run_direct_docker_download_worker - use_env = policy.get('direct_docker_cli_policy', {}).get('use_profile_env', 'auth' if mode == 'fetch_only' else 'download') - manager_for_worker = profile_managers.get(use_env) - elif orchestration_mode == 'direct_download_cli': - target_worker_func = run_direct_download_worker - manager_for_worker = profile_managers.get('download') - # Other modes (queue, task-first) are handled separately below. - - if use_worker_pools: - # New logic: Filter worker pools if a specific profile_prefix is given via CLI - pools_to_run = worker_pools - if args.profile_prefix: - logger.info(f"CLI --profile-prefix '{args.profile_prefix}' provided. Filtering worker pools.") - pools_to_run = [p for p in worker_pools if p.get('profile_prefix') == args.profile_prefix] - if not pools_to_run: - logger.error(f"No worker pool found in policy with profile_prefix matching '{args.profile_prefix}'. 
Exiting.") - return 1 - - total_workers = sum(p.get('workers', 1) for p in pools_to_run) - worker_idx_counter = 0 - for pool in pools_to_run: - pool_prefix = pool.get('profile_prefix') - num_workers_in_pool = pool.get('workers', 1) - if not pool_prefix: - logger.warning(f"Worker pool found without a 'profile_prefix'. Skipping: {pool}") - continue - for _ in range(num_workers_in_pool): - worker_configs.append({ - 'id': worker_idx_counter, - 'prefix': pool_prefix, - 'pool_info': f"Pool '{pool_prefix}'" - }) - worker_idx_counter += 1 - else: - total_workers = exec_control.get('workers', 1) - if cli_overrode_workers: - logger.info(f"Overriding 'worker_pools' with CLI setting: --set execution_control.workers={total_workers}") - for i in range(total_workers): - worker_configs.append({ - 'id': i, - 'prefix': None, # No specific prefix - 'pool_info': "Legacy 'workers' config" - }) - # --- Throughput Orchestration Mode --- if orchestration_mode == 'throughput': logger.info("--- Throughput Orchestration Mode Enabled ---") @@ -551,44 +471,47 @@ def main_stress_policy(args): logger.error("Orchestration mode 'throughput' is only compatible with 'download_only' mode and 'from_pool_with_lock' profile mode.") return 1 - if not manager_for_worker: + download_manager = profile_managers.get('download') + if not download_manager: logger.error("Throughput mode requires a download profile manager.") return 1 original_workers_setting = exec_control.get('workers') if original_workers_setting == 'auto': - # This logic is complex and specific to this mode, so we keep it here. d_policy = policy.get('download_policy', {}) profile_prefix = d_policy.get('profile_prefix') if not profile_prefix: logger.error("Cannot calculate 'auto' workers for throughput mode without 'download_policy.profile_prefix'.") return 1 - all_profiles = manager_for_worker.list_profiles() + + all_profiles = download_manager.list_profiles() matching_profiles = [p for p in all_profiles if p['name'].startswith(profile_prefix)] calculated_workers = len(matching_profiles) + if calculated_workers == 0: logger.error(f"Cannot use 'auto' workers: No profiles found with prefix '{profile_prefix}'. 
Please run setup-profiles.") return 1 + exec_control['workers'] = calculated_workers logger.info(f"Calculated 'auto' workers for throughput mode: {calculated_workers} (based on {len(matching_profiles)} profiles with prefix '{profile_prefix}').") - # Recalculate worker configs if 'auto' was used - total_workers = calculated_workers - worker_configs = [{'id': i, 'prefix': None, 'pool_info': "Legacy 'workers' config"} for i in range(total_workers)] sp_utils.display_effective_policy(policy, policy_name, sources=[], original_workers_setting=original_workers_setting) if args.dry_run: return 0 - with concurrent.futures.ThreadPoolExecutor(max_workers=total_workers) as executor: - futures = [] - logger.info(f"Launching {total_workers} worker(s)...") - for config in worker_configs: - logger.info(f" - Worker {config['id']}: {config['pool_info']}") - futures.append(executor.submit(target_worker_func, config['id'], policy, state_manager, args, manager_for_worker, running_processes, process_lock, profile_prefix=config['prefix'])) - + workers = exec_control.get('workers', 1) + with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: + futures = [ + executor.submit(run_throughput_worker, i, policy, state_manager, args, download_manager, running_processes, process_lock) + for i in range(workers) + ] + # Wait for shutdown signal shutdown_event.wait() logger.info("Shutdown signal received, waiting for throughput workers to finish current tasks...") + # The workers will exit their loops upon seeing the shutdown_event. + # We don't need complex shutdown logic here; the main `finally` block will handle summary. concurrent.futures.wait(futures) + # In this mode, the main loop is handled by workers. So we return here. state_manager.print_summary(policy) state_manager.close() return 0 @@ -600,8 +523,13 @@ def main_stress_policy(args): logger.error("Orchestration mode 'direct_batch_cli' is only compatible with 'fetch_only' mode and 'from_pool_with_lock' profile mode.") return 1 - if not manager_for_worker: - logger.error(f"Direct batch CLI mode requires a profile manager, but it was not configured.") + direct_policy = policy.get('direct_batch_cli_policy', {}) + use_env = direct_policy.get('use_profile_env', 'auth') # Default to auth for backward compatibility + + profile_manager_instance = profile_managers.get(use_env) + if not profile_manager_instance: + logger.error(f"Direct batch CLI mode requires a '{use_env}' profile manager, but it was not configured.") + logger.error("Check 'simulation_parameters' in your policy and the 'mode' setting.") return 1 urls_file = settings.get('urls_file') @@ -620,10 +548,13 @@ def main_stress_policy(args): logger.error(f"URL file '{urls_file}' is empty. Nothing to do.") return 1 - start_index = state_manager.get_last_url_index() + # Handle starting from a specific index + start_index = 0 if args.start_from_url_index is not None: start_index = max(0, args.start_from_url_index - 1) state_manager.update_last_url_index(start_index, force=True) + else: + start_index = state_manager.get_last_url_index() if start_index >= len(urls_list) and len(urls_list) > 0: logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") @@ -632,24 +563,31 @@ def main_stress_policy(args): logger.warning("!!! Deleting state file and stopping. Please run the command again to start from the beginning. 
!!!") logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") if not args.dry_run and not args.disable_log_writing: - state_manager.close() - try: os.remove(state_manager.state_file_path) - except OSError as e: logger.error(f"Failed to delete state file: {e}") - return 0 + state_manager.close() # ensure it's closed before deleting + try: + os.remove(state_manager.state_file_path) + logger.info(f"Deleted state file: {state_manager.state_file_path}") + except OSError as e: + logger.error(f"Failed to delete state file: {e}") + else: + logger.info("[Dry Run] Would have deleted state file and stopped.") + + return 0 # Stop execution. if start_index > 0: logger.info(f"Starting/resuming from URL index {start_index + 1}.") + # The worker's get_next_url_batch will respect this starting index. sp_utils.display_effective_policy(policy, policy_name, sources=urls_list) if args.dry_run: return 0 - with concurrent.futures.ThreadPoolExecutor(max_workers=total_workers) as executor: - futures = [] - logger.info(f"Launching {total_workers} worker(s)...") - for config in worker_configs: - logger.info(f" - Worker {config['id']}: {config['pool_info']}") - futures.append(executor.submit(target_worker_func, config['id'], policy, state_manager, args, manager_for_worker, urls_list, running_processes, process_lock, profile_prefix=config['prefix'])) - + workers = exec_control.get('workers', 1) + with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: + futures = [ + executor.submit(run_direct_batch_worker, i, policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock) + for i in range(workers) + ] + # Wait for all workers to complete. They will exit their loops when no URLs are left. concurrent.futures.wait(futures) if shutdown_event.is_set(): logger.info("Shutdown signal received, workers have finished.") @@ -670,77 +608,96 @@ def main_stress_policy(args): logger.error("Orchestration mode 'direct_docker_cli' is only compatible with 'fetch_only' or 'download_only' modes and 'from_pool_with_lock' profile mode.") return 1 - if not manager_for_worker: - logger.error(f"Direct docker CLI mode requires a profile manager, but it was not configured.") + direct_policy = policy.get('direct_docker_cli_policy', {}) + use_env = direct_policy.get('use_profile_env', 'auth' if mode == 'fetch_only' else 'download') + + profile_manager_instance = profile_managers.get(use_env) + if not profile_manager_instance: + logger.error(f"Direct docker CLI mode requires a '{use_env}' profile manager, but it was not configured.") return 1 + workers = exec_control.get('workers', 1) + if mode == 'fetch_only': - queue_policy = policy.get('queue_policy') - if not queue_policy: - urls_file = settings.get('urls_file') - if not urls_file: - logger.error("Direct docker CLI (fetch) mode requires 'settings.urls_file' if not configured for queue operation.") - return 1 - try: - with open(urls_file, 'r', encoding='utf-8') as f: - urls_list = [line.strip() for line in f if line.strip()] - except IOError as e: - logger.error(f"Could not read urls_file '{urls_file}': {e}") - return 1 - if not urls_list: - logger.error(f"URL file '{urls_file}' is empty. 
Nothing to do.") - return 1 - start_index = state_manager.get_last_url_index() - if args.start_from_url_index is not None: - start_index = max(0, args.start_from_url_index - 1) - state_manager.update_last_url_index(start_index, force=True) - if start_index >= len(urls_list) and len(urls_list) > 0: - logger.warning("ALL URLS HAVE BEEN PROCESSED. Reset state file to run again.") - return 0 - if start_index > 0: - logger.info(f"Starting/resuming from URL index {start_index + 1}.") - else: - # Queue mode setup - # ... (omitted for brevity, assuming file mode for this fix) - pass - elif mode == 'download_only': - # ... (omitted for brevity, assuming fetch mode for this fix) - pass - - sp_utils.display_effective_policy(policy, policy_name, sources=urls_list) - if args.dry_run: return 0 - - with concurrent.futures.ThreadPoolExecutor(max_workers=total_workers) as executor: - futures = [] - logger.info(f"Launching {total_workers} worker(s)...") - for config in worker_configs: - logger.info(f" - Worker {config['id']}: {config['pool_info']}") - if mode == 'fetch_only': - futures.append(executor.submit( - target_worker_func, config['id'], policy, state_manager, args, - manager_for_worker, urls_list, running_processes, process_lock, - profile_prefix=config['prefix'] - )) - elif mode == 'download_only': - futures.append(executor.submit( - target_worker_func, config['id'], policy, state_manager, args, - manager_for_worker, running_processes, process_lock, - profile_prefix=config['prefix'] - )) - else: - logger.error(f"Unsupported mode '{mode}' for 'direct_docker_cli' orchestration.") - shutdown_event.set() - break - - if shutdown_event.is_set(): - pass # An error occurred, just exit - elif mode == 'fetch_only' and not policy.get('queue_policy'): - concurrent.futures.wait(futures) - else: # download_only or queue mode runs until shutdown - shutdown_event.wait() + urls_file = settings.get('urls_file') + if not urls_file: + logger.error("Direct docker CLI (fetch) mode requires 'settings.urls_file'.") + return 1 - if shutdown_event.is_set(): - logger.info("Shutdown signal received, workers have finished.") + try: + with open(urls_file, 'r', encoding='utf-8') as f: + urls_list = [line.strip() for line in f if line.strip()] + except IOError as e: + logger.error(f"Could not read urls_file '{urls_file}': {e}") + return 1 + + if not urls_list: + logger.error(f"URL file '{urls_file}' is empty. Nothing to do.") + return 1 + + start_index = 0 + if args.start_from_url_index is not None: + start_index = max(0, args.start_from_url_index - 1) + state_manager.update_last_url_index(start_index, force=True) + else: + start_index = state_manager.get_last_url_index() + + if start_index >= len(urls_list) and len(urls_list) > 0: + logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + logger.warning("!!! ALL URLS HAVE BEEN PROCESSED IN PREVIOUS RUNS (based on state file) !!!") + logger.warning(f"!!! State file indicates start index {start_index + 1}, but URL file has only {len(urls_list)} URLs. !!!") + logger.warning("!!! Deleting state file and stopping. Please run the command again to start from the beginning. 
!!!") + logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + if not args.dry_run and not args.disable_log_writing: + state_manager.close() + try: + os.remove(state_manager.state_file_path) + logger.info(f"Deleted state file: {state_manager.state_file_path}") + except OSError as e: + logger.error(f"Failed to delete state file: {e}") + else: + logger.info("[Dry Run] Would have deleted state file and stopped.") + return 0 + + if start_index > 0: + logger.info(f"Starting/resuming from URL index {start_index + 1}.") + + sp_utils.display_effective_policy(policy, policy_name, sources=urls_list) + if args.dry_run: return 0 + + with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: + futures = [ + executor.submit(run_direct_docker_worker, i, policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock) + for i in range(workers) + ] + # This worker runs until shutdown, like the download worker + shutdown_event.wait() + logger.info("Shutdown signal received, waiting for direct docker workers to finish...") + concurrent.futures.wait(futures) + + elif mode == 'download_only': + info_json_dir = settings.get('info_json_dir') + if not info_json_dir: + logger.error("Direct docker CLI (download) mode requires 'settings.info_json_dir'.") + return 1 + try: + os.makedirs(info_json_dir, exist_ok=True) + except OSError as e: + logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}") + return 1 + + sp_utils.display_effective_policy(policy, policy_name, sources=[]) + if args.dry_run: return 0 + + with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: + futures = [ + executor.submit(run_direct_docker_download_worker, i, policy, state_manager, args, profile_manager_instance, running_processes, process_lock) + for i in range(workers) + ] + # This worker runs until shutdown + shutdown_event.wait() + logger.info("Shutdown signal received, waiting for direct docker download workers to finish...") + concurrent.futures.wait(futures) state_manager.print_summary(policy) state_manager.close() @@ -753,7 +710,8 @@ def main_stress_policy(args): logger.error("Orchestration mode 'direct_download_cli' is only compatible with 'download_only' mode and 'from_pool_with_lock' profile mode.") return 1 - if not manager_for_worker: + download_manager = profile_managers.get('download') + if not download_manager: logger.error("Direct download CLI mode requires a download profile manager.") return 1 @@ -761,6 +719,7 @@ def main_stress_policy(args): if not info_json_dir: logger.error("Direct download CLI mode requires 'settings.info_json_dir'.") return 1 + try: os.makedirs(info_json_dir, exist_ok=True) except OSError as e: @@ -770,13 +729,12 @@ def main_stress_policy(args): sp_utils.display_effective_policy(policy, policy_name, sources=[]) if args.dry_run: return 0 - with concurrent.futures.ThreadPoolExecutor(max_workers=total_workers) as executor: - futures = [] - logger.info(f"Launching {total_workers} worker(s)...") - for config in worker_configs: - logger.info(f" - Worker {config['id']}: {config['pool_info']}") - futures.append(executor.submit(target_worker_func, config['id'], policy, state_manager, args, manager_for_worker, running_processes, process_lock, profile_prefix=config['prefix'])) - + workers = exec_control.get('workers', 1) + with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: + futures = [ + executor.submit(run_direct_download_worker, i, policy, state_manager, 
args, download_manager, running_processes, process_lock) + for i in range(workers) + ] shutdown_event.wait() logger.info("Shutdown signal received, waiting for direct download workers to finish...") concurrent.futures.wait(futures) @@ -785,18 +743,650 @@ def main_stress_policy(args): state_manager.close() return 0 - # --- Queue-based Orchestration Modes --- - elif orchestration_mode in ['queue_auth', 'queue_download', 'queue_full_stack']: - # This logic is complex and separate. For now, we assume it doesn't use worker_pools yet. - # If it needs to, it will require similar changes. - # ... (existing queue logic) - logger.error(f"Orchestration mode '{orchestration_mode}' is not fully covered by the new worker logic yet.") - return 1 + # --- Queue Auth Orchestration Mode --- + elif orchestration_mode == 'queue_auth': + logger.info("--- Queue Auth Orchestration Mode Enabled ---") + if mode != 'fetch_only' or settings.get('profile_mode') != 'from_pool_with_lock': + logger.error("Orchestration mode 'queue_auth' is only compatible with 'fetch_only' mode and 'from_pool_with_lock' profile mode.") + return 1 + auth_manager = profile_managers.get('auth') + if not auth_manager: + logger.error("Queue auth mode requires an auth profile manager.") + return 1 + + # Initialize queue provider + queue_policy = policy.get('queue_policy', {}) + redis_host = args.redis_host or os.getenv('REDIS_HOST') or queue_policy.get('redis_host') or 'localhost' + redis_port = args.redis_port if args.redis_port is not None else (int(os.getenv('REDIS_PORT')) if os.getenv('REDIS_PORT') else (queue_policy.get('redis_port') or 6379)) + redis_password = args.redis_password or os.getenv('REDIS_PASSWORD') or queue_policy.get('redis_password') + redis_db = args.redis_db if args.redis_db is not None else (int(os.getenv('REDIS_DB')) if os.getenv('REDIS_DB') else (queue_policy.get('redis_db') or 0)) + + # Extract env from manager's key prefix, unless disabled by policy + use_env_prefix = queue_policy.get('use_env_prefix', True) + env_prefix = None + if use_env_prefix: + env_prefix = auth_manager.key_prefix.removesuffix('_profile_mgmt_') + + state_manager.initialize_queue_provider( + redis_host=redis_host, + redis_port=redis_port, + redis_password=redis_password, + redis_db=redis_db, + env_prefix=env_prefix + ) + + # Create save directory if specified + save_dir = settings.get('save_info_json_dir') + if save_dir: + try: + os.makedirs(save_dir, exist_ok=True) + logger.info(f"Created save directory for info.json files: {save_dir}") + except OSError as e: + logger.error(f"Failed to create save directory '{save_dir}': {e}") + return 1 + + # Requeue failed tasks if requested + if args.requeue_failed: + requeued = state_manager.requeue_failed_auth_tasks( + batch_size=queue_policy.get('requeue_batch_size', 100) + ) + logger.info(f"Requeued {requeued} failed authentication tasks.") + + sp_utils.display_effective_policy(policy, policy_name, sources=[]) + if args.dry_run: return 0 + + workers = exec_control.get('workers', 1) + with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: + futures = [ + executor.submit(run_queue_auth_worker, i, policy, state_manager, args, auth_manager, running_processes, process_lock) + for i in range(workers) + ] + shutdown_event.wait() + logger.info("Shutdown signal received, waiting for queue auth workers to finish...") + concurrent.futures.wait(futures) + + state_manager.print_summary(policy) + state_manager.close() + return 0 + + # --- Queue Download Orchestration Mode --- + elif 
orchestration_mode == 'queue_download': + logger.info("--- Queue Download Orchestration Mode Enabled ---") + if mode != 'download_only' or settings.get('profile_mode') != 'from_pool_with_lock': + logger.error("Orchestration mode 'queue_download' is only compatible with 'download_only' mode and 'from_pool_with_lock' profile mode.") + return 1 + + download_manager = profile_managers.get('download') + if not download_manager: + logger.error("Queue download mode requires a download profile manager.") + return 1 + + # Initialize queue provider + queue_policy = policy.get('queue_policy', {}) + redis_host = args.redis_host or os.getenv('REDIS_HOST') or queue_policy.get('redis_host') or 'localhost' + redis_port = args.redis_port if args.redis_port is not None else (int(os.getenv('REDIS_PORT')) if os.getenv('REDIS_PORT') else (queue_policy.get('redis_port') or 6379)) + redis_password = args.redis_password or os.getenv('REDIS_PASSWORD') or queue_policy.get('redis_password') + redis_db = args.redis_db if args.redis_db is not None else (int(os.getenv('REDIS_DB')) if os.getenv('REDIS_DB') else (queue_policy.get('redis_db') or 0)) + + # Extract env from manager's key prefix, unless disabled by policy + use_env_prefix = queue_policy.get('use_env_prefix', True) + env_prefix = None + if use_env_prefix: + env_prefix = download_manager.key_prefix.removesuffix('_profile_mgmt_') + + state_manager.initialize_queue_provider( + redis_host=redis_host, + redis_port=redis_port, + redis_password=redis_password, + redis_db=redis_db, + env_prefix=env_prefix + ) + + # Create output directory if specified + output_dir = d_policy.get('output_dir') + if output_dir: + try: + os.makedirs(output_dir, exist_ok=True) + logger.info(f"Created output directory for downloads: {output_dir}") + except OSError as e: + logger.error(f"Failed to create output directory '{output_dir}': {e}") + return 1 + + # Requeue failed tasks if requested + if args.requeue_failed: + requeued = state_manager.requeue_failed_download_tasks( + batch_size=queue_policy.get('requeue_batch_size', 100) + ) + logger.info(f"Requeued {requeued} failed download tasks.") + + sp_utils.display_effective_policy(policy, policy_name, sources=[]) + if args.dry_run: return 0 + + workers = exec_control.get('workers', 1) + with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: + futures = [ + executor.submit(run_queue_download_worker, i, policy, state_manager, args, download_manager, running_processes, process_lock) + for i in range(workers) + ] + shutdown_event.wait() + logger.info("Shutdown signal received, waiting for queue download workers to finish...") + concurrent.futures.wait(futures) + + state_manager.print_summary(policy) + state_manager.close() + return 0 + + # --- Queue Full Stack Orchestration Mode --- + elif orchestration_mode == 'queue_full_stack': + logger.info("--- Queue Full Stack Orchestration Mode Enabled ---") + if mode != 'full_stack' or settings.get('profile_mode') != 'from_pool_with_lock': + logger.error("Orchestration mode 'queue_full_stack' is only compatible with 'full_stack' mode and 'from_pool_with_lock' profile mode.") + return 1 + + auth_manager = profile_managers.get('auth') + if not auth_manager: + logger.error("Queue full stack mode requires an auth profile manager.") + return 1 + + download_manager = profile_managers.get('download') + if not download_manager: + logger.error("Queue full stack mode requires a download profile manager.") + return 1 + + # Initialize queue provider + queue_policy = 
policy.get('queue_policy', {}) + redis_host = args.redis_host or os.getenv('REDIS_HOST') or queue_policy.get('redis_host') or 'localhost' + redis_port = args.redis_port if args.redis_port is not None else (int(os.getenv('REDIS_PORT')) if os.getenv('REDIS_PORT') else (queue_policy.get('redis_port') or 6379)) + redis_password = args.redis_password or os.getenv('REDIS_PASSWORD') or queue_policy.get('redis_password') + redis_db = args.redis_db if args.redis_db is not None else (int(os.getenv('REDIS_DB')) if os.getenv('REDIS_DB') else (queue_policy.get('redis_db') or 0)) + + # Extract env from auth manager's key prefix, unless disabled by policy + use_env_prefix = queue_policy.get('use_env_prefix', True) + env_prefix = None + if use_env_prefix: + auth_prefix = auth_manager.key_prefix.removesuffix('_profile_mgmt_') + download_prefix = download_manager.key_prefix.removesuffix('_profile_mgmt_') + if auth_prefix != download_prefix: + logger.warning(f"Auth environment ('{auth_prefix}') and Download environment ('{download_prefix}') are different.") + logger.warning(f"Using '{auth_prefix}' as the prefix for all shared Redis queues.") + env_prefix = auth_prefix + + state_manager.initialize_queue_provider( + redis_host=redis_host, + redis_port=redis_port, + redis_password=redis_password, + redis_db=redis_db, + env_prefix=env_prefix + ) + + # Create directories if specified + save_dir = settings.get('save_info_json_dir') + if save_dir: + try: + os.makedirs(save_dir, exist_ok=True) + logger.info(f"Created save directory for info.json files: {save_dir}") + except OSError as e: + logger.error(f"Failed to create save directory '{save_dir}': {e}") + return 1 + + output_dir = d_policy.get('output_dir') + if output_dir: + try: + os.makedirs(output_dir, exist_ok=True) + logger.info(f"Created output directory for downloads: {output_dir}") + except OSError as e: + logger.error(f"Failed to create output directory '{output_dir}': {e}") + return 1 + + # Requeue failed tasks if requested + if args.requeue_failed: + requeued_auth = state_manager.requeue_failed_auth_tasks( + batch_size=queue_policy.get('requeue_batch_size', 100) + ) + requeued_dl = state_manager.requeue_failed_download_tasks( + batch_size=queue_policy.get('requeue_batch_size', 100) + ) + logger.info(f"Requeued {requeued_auth} failed authentication tasks and {requeued_dl} failed download tasks.") + + sp_utils.display_effective_policy(policy, policy_name, sources=[]) + if args.dry_run: return 0 + + # Start both auth and download workers + auth_workers = exec_control.get('auth_workers', 1) + download_workers = exec_control.get('download_workers', 2) + + with concurrent.futures.ThreadPoolExecutor(max_workers=auth_workers + download_workers) as executor: + # Start auth workers + auth_futures = [ + executor.submit(run_queue_auth_worker, i, policy, state_manager, args, auth_manager, running_processes, process_lock) + for i in range(auth_workers) + ] + + # Start download workers + dl_futures = [ + executor.submit(run_queue_download_worker, i + auth_workers, policy, state_manager, args, download_manager, running_processes, process_lock) + for i in range(download_workers) + ] + + # Start requeue task if configured + requeue_interval = queue_policy.get('requeue_interval_seconds') + requeue_enabled = queue_policy.get('requeue_failed_tasks', False) + + if requeue_enabled and requeue_interval: + def requeue_task(): + while not shutdown_event.is_set(): + time.sleep(requeue_interval) + if shutdown_event.is_set(): + break + + try: + requeued_auth = 
state_manager.requeue_failed_auth_tasks( + batch_size=queue_policy.get('requeue_batch_size', 100) + ) + requeued_dl = state_manager.requeue_failed_download_tasks( + batch_size=queue_policy.get('requeue_batch_size', 100) + ) + + if requeued_auth > 0 or requeued_dl > 0: + logger.info(f"Auto-requeued {requeued_auth} failed auth tasks and {requeued_dl} failed download tasks.") + except Exception as e: + logger.error(f"Error in auto-requeue task: {e}") + + requeue_future = executor.submit(requeue_task) + all_futures = auth_futures + dl_futures + [requeue_future] + else: + all_futures = auth_futures + dl_futures + + # Wait for shutdown signal + shutdown_event.wait() + logger.info("Shutdown signal received, waiting for queue workers to finish...") + concurrent.futures.wait(all_futures) + + state_manager.print_summary(policy) + state_manager.close() + return 0 + # --- Default (Task-First) Orchestration Mode --- - # ... (existing task-first logic) - logger.error(f"Orchestration mode 'task-first' (default) is not fully covered by the new worker logic yet.") - return 1 + sources = [] # This will be a list of URLs or Path objects + if mode in ['full_stack', 'fetch_only']: + urls_file = settings.get('urls_file') + if not urls_file: + logger.error("Policy mode requires 'settings.urls_file'.") + return 1 + try: + with open(urls_file, 'r', encoding='utf-8') as f: + content = f.read() + try: + data = json.loads(content) + if isinstance(data, list) and all(isinstance(item, str) for item in data): + sources = data + logger.info(f"Loaded {len(sources)} URLs/IDs from JSON array in {urls_file}.") + else: + logger.error(f"URL file '{urls_file}' is valid JSON but not an array of strings.") + return 1 + except json.JSONDecodeError: + sources = [line.strip() for line in content.splitlines() if line.strip()] + logger.info(f"Loaded {len(sources)} URLs/IDs from text file {urls_file}.") + except IOError as e: + logger.error(f"Failed to read urls_file {urls_file}: {e}") + return 1 + + # Clean up URLs/IDs which might have extra quotes, commas, or brackets from copy-pasting + cleaned_sources = [] + for source in sources: + cleaned_source = source.strip().rstrip(',').strip().strip('\'"[]').strip() + if cleaned_source: + cleaned_sources.append(cleaned_source) + + if len(cleaned_sources) != len(sources): + logger.info(f"Cleaned URL list, removed {len(sources) - len(cleaned_sources)} empty or invalid entries.") + + sources = cleaned_sources + elif mode == 'download_only': + # If not in continuous mode, load sources once at the start. + # In continuous mode, `sources` is populated at the start of each cycle. + if settings.get('directory_scan_mode') != 'continuous': + info_json_dir = settings.get('info_json_dir') + if not info_json_dir: + logger.error("Policy mode 'download_only' requires 'settings.info_json_dir'.") + return 1 + try: + all_files = sorted(Path(info_json_dir).glob('*.json')) + sample_percent = settings.get('info_json_dir_sample_percent') + if sample_percent and 0 < sample_percent <= 100: + sample_count = int(len(all_files) * (sample_percent / 100.0)) + num_to_sample = min(len(all_files), max(1, sample_count)) + sources = random.sample(all_files, k=num_to_sample) + logger.info(f"Randomly sampled {len(sources)} files ({sample_percent}%) from {info_json_dir}") + else: + sources = all_files + except (IOError, FileNotFoundError) as e: + logger.error(f"Failed to read info_json_dir {info_json_dir}: {e}") + return 1 + + # In continuous download mode, sources are loaded inside the loop, so we skip this check. 
+ if settings.get('directory_scan_mode') != 'continuous' and not sources: + logger.error("No sources (URLs or info.json files) to process. Exiting.") + return 1 + + start_index = 0 + if mode in ['full_stack', 'fetch_only']: + if args.start_from_url_index is not None: + # User provided a 1-based index via CLI + start_index = max(0, args.start_from_url_index - 1) + logger.info(f"Starting from URL index {start_index + 1} as requested by --start-from-url-index.") + # When user specifies it, we should overwrite the saved state. + state_manager.update_last_url_index(start_index, force=True) + else: + start_index = state_manager.get_last_url_index() + if start_index > 0: + logger.info(f"Resuming from URL index {start_index + 1} based on saved state.") + + if start_index >= len(sources): + logger.warning(f"Start index ({start_index + 1}) is beyond the end of the URL list ({len(sources)}). Nothing to process.") + sources = [] + + # --- Auto-calculate workers if needed --- + original_workers_setting = exec_control.get('workers') + if original_workers_setting == 'auto': + # In this simplified model, 'auto' is based on target rate, not profiles. + target_rate_cfg = exec_control.get('target_rate', {}) + target_reqs = target_rate_cfg.get('requests') + target_mins = target_rate_cfg.get('per_minutes') + if target_reqs and target_mins and sources: + target_rpm = target_reqs / target_mins + num_sources = len(sources) + sleep_cfg = exec_control.get('sleep_between_tasks', {}) + avg_sleep = (sleep_cfg.get('min_seconds', 0) + sleep_cfg.get('max_seconds', 0)) / 2 + assumed_task_duration = 12 # Must match assumption in display_effective_policy + + # Formula: workers = (total_work_seconds) / (total_time_for_work) + # total_time_for_work is derived from the target rate: + # (total_cycle_time) = (60 * num_sources) / target_rpm + # total_time_for_work = total_cycle_time - avg_sleep + work_time_available = (60 * num_sources / target_rpm) - avg_sleep + + if work_time_available <= 0: + # The sleep time alone makes the target rate impossible. + # Set workers to max parallelism as a best-effort. + num_workers = num_sources + logger.warning(f"Target rate of {target_rpm} req/min is likely unachievable due to sleep time of {avg_sleep}s.") + logger.warning(f"Setting workers to max parallelism ({num_workers}) as a best effort.") + else: + total_work_seconds = num_sources * assumed_task_duration + num_workers = total_work_seconds / work_time_available + + calculated_workers = max(1, int(num_workers + 0.99)) # Ceiling + exec_control['workers'] = calculated_workers + logger.info(f"Calculated 'auto' workers based on target rate: {calculated_workers}") + else: + logger.warning("Cannot calculate 'auto' workers: 'target_rate' or sources are not defined. Defaulting to 1 worker.") + exec_control['workers'] = 1 + + sp_utils.display_effective_policy( + policy, + policy_name, + sources=sources, + profile_names=None, # Profile grouping is removed + original_workers_setting=original_workers_setting + ) + + if args.dry_run: + logger.info("Dry run complete. Exiting.") + return 0 + + start_time = time.time() + + run_until_cfg = exec_control.get('run_until', {}) + duration_seconds = (run_until_cfg.get('minutes') or 0) * 60 + max_cycles = run_until_cfg.get('cycles') or 0 + max_requests = run_until_cfg.get('requests') or 0 + + # --- Main test loop --- + cycles = 0 + try: + while not shutdown_event.is_set(): + if duration_seconds and (time.time() - start_time) > duration_seconds: + logger.info("Reached duration limit. 
Stopping.") + break + if max_requests > 0 and state_manager.get_request_count() >= max_requests: + logger.info(f"Reached max requests ({max_requests}). Stopping.") + break + + # --- Rescan for sources if in continuous download mode --- + if mode == 'download_only' and settings.get('directory_scan_mode') == 'continuous': + info_json_dir = settings.get('info_json_dir') + try: + all_files_in_dir = Path(info_json_dir).glob('*.json') + processed_files = state_manager.get_processed_files() + + new_files = [f for f in all_files_in_dir if str(f) not in processed_files] + + # Sort by modification time, oldest first, to process in order of creation + new_files.sort(key=os.path.getmtime) + + max_files_per_cycle = settings.get('max_files_per_cycle') + if max_files_per_cycle and len(new_files) > max_files_per_cycle: + sources = new_files[:max_files_per_cycle] + else: + sources = new_files + + if not sources: + sleep_duration = settings.get('sleep_if_no_new_files_seconds', 10) + logger.info(f"No new info.json files found in '{info_json_dir}'. Sleeping for {sleep_duration}s...") + + # Interruptible sleep + sleep_end_time = time.time() + sleep_duration + while time.time() < sleep_end_time: + if shutdown_event.is_set(): + break + time.sleep(0.5) + + if shutdown_event.is_set(): + break + continue # Skip to next iteration of the while loop + + except (IOError, FileNotFoundError) as e: + logger.error(f"Failed to read info_json_dir {info_json_dir}: {e}. Retrying in 10s.") + time.sleep(10) + continue + + # --- Group sources for this cycle --- + task_items = sources + + # If there's nothing to do this cycle, skip. + if not task_items: + if mode == 'download_only' and settings.get('directory_scan_mode') == 'continuous': + # The sleep logic is handled inside the rescanning block. + continue + else: + logger.info("No more sources to process. Ending test.") + break + + cycles += 1 + if max_cycles > 0 and cycles > max_cycles: + logger.info(f"Reached max cycles ({max_cycles}). Stopping.") + break + + logger.info(f"--- Cycle #{cycles} (Total Requests: {state_manager.get_request_count()}) ---") + + with concurrent.futures.ThreadPoolExecutor(max_workers=exec_control.get('workers', 1)) as executor: + # Submit one task per source URL or info.json file + future_to_task_info = { + executor.submit(process_task, source, i, cycles, policy, state_manager, args, profile_managers, running_processes, process_lock): { + 'source': source, + 'abs_index': i + } + for i, source in enumerate(task_items) if i >= start_index + } + + should_stop = False + pending_futures = set(future_to_task_info.keys()) + + while pending_futures and not should_stop: + done, pending_futures = concurrent.futures.wait( + pending_futures, return_when=concurrent.futures.FIRST_COMPLETED + ) + + for future in done: + if shutdown_event.is_set(): + should_stop = True + break + + task_info = future_to_task_info[future] + source = task_info['source'] + abs_index = task_info.get('abs_index') + + try: + results = future.result() + + if abs_index is not None and mode in ['full_stack', 'fetch_only']: + # Update state to resume from the *next* URL. + state_manager.update_last_url_index(abs_index + 1) + + # --- Mark file as processed --- + # This is the central place to mark a source as complete for download_only mode. + if mode == 'download_only': + # In continuous mode, we add to state file to prevent re-picking in same run. + if settings.get('directory_scan_mode') == 'continuous': + state_manager.mark_file_as_processed(source) + # If marking by rename is on, do that. 
+ if settings.get('mark_processed_files'): + try: + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + new_path = source.parent / f"{source.name}.{timestamp}.processed" + source.rename(new_path) + logger.info(f"Marked '{source.name}' as processed by renaming to '{new_path.name}'") + except (IOError, OSError) as e: + logger.error(f"Failed to rename processed file '{source.name}': {e}") + + # When using profile-aware mode, the file processing (including marking as + # processed) is handled inside process_profile_task. + # For non-profile mode, this logic was incorrect and has been moved. + + for result in results: + if not result['success']: + s_conditions = policy.get('stop_conditions', {}) + is_cumulative_403_active = s_conditions.get('on_cumulative_403', {}).get('max_errors') + if s_conditions.get('on_failure') or \ + (s_conditions.get('on_http_403') and not is_cumulative_403_active and result['error_type'] == 'HTTP 403') or \ + (s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'): + logger.info(f"!!! STOP CONDITION MET: Immediate stop on failure '{result['error_type']}' for {sp_utils.get_display_name(source)}. Shutting down all workers. !!!") + should_stop = True + break + except concurrent.futures.CancelledError: + logger.info(f"Task for {sp_utils.get_display_name(source)} was cancelled during shutdown.") + event = { + 'type': 'fetch' if mode != 'download_only' else 'download', + 'path': str(source), + 'success': False, + 'error_type': 'Cancelled', + 'details': 'Task cancelled during shutdown.' + } + state_manager.log_event(event) + except Exception as exc: + logger.error(f'{sp_utils.get_display_name(source)} generated an exception: {exc}') + + if should_stop: + break + + # Check for all stop conditions after each task completes. + + # 1. Max requests limit + if not should_stop and max_requests > 0 and state_manager.get_request_count() >= max_requests: + logger.info(f"!!! STOP CONDITION MET: Reached request limit ({max_requests}). Shutting down. !!!") + should_stop = True + + # 2. Duration limit + if not should_stop and duration_seconds and (time.time() - start_time) > duration_seconds: + logger.info(f"!!! STOP CONDITION MET: Reached duration limit ({run_until_cfg.get('minutes')} minutes). Shutting down. !!!") + should_stop = True + + # 3. Cumulative error rate limits + s_conditions = policy.get('stop_conditions', {}) + error_rate_policy = s_conditions.get('on_error_rate') + if not should_stop and error_rate_policy: + max_errors = error_rate_policy.get('max_errors') + per_minutes = error_rate_policy.get('per_minutes') + if max_errors and per_minutes: + error_count = state_manager.check_cumulative_error_rate(max_errors, per_minutes) + if error_count > 0: + logger.info(f"!!! STOP CONDITION MET: Error rate exceeded ({error_count} errors in last {per_minutes}m). Shutting down. !!!") + should_stop = True + + cumulative_403_policy = s_conditions.get('on_cumulative_403') + if not should_stop and cumulative_403_policy: + max_errors = cumulative_403_policy.get('max_errors') + per_minutes = cumulative_403_policy.get('per_minutes') + if max_errors and per_minutes: + error_count = state_manager.check_cumulative_error_rate(max_errors, per_minutes, error_type='HTTP 403') + if error_count > 0: + logger.info(f"!!! STOP CONDITION MET: Cumulative 403 rate exceeded ({error_count} in last {per_minutes}m). Shutting down. 
!!!") + should_stop = True + + quality_degradation_policy = s_conditions.get('on_quality_degradation') + if not should_stop and quality_degradation_policy: + max_triggers = quality_degradation_policy.get('max_triggers') + per_minutes = quality_degradation_policy.get('per_minutes') + if max_triggers and per_minutes: + trigger_count = state_manager.check_quality_degradation_rate(max_triggers, per_minutes) + if trigger_count > 0: + logger.info(f"!!! STOP CONDITION MET: Quality degradation triggered {trigger_count} times in last {per_minutes}m. Shutting down. !!!") + should_stop = True + + if should_stop: + break + + if should_stop and pending_futures: + logger.info(f"Cancelling {len(pending_futures)} outstanding task(s).") + for future in pending_futures: + future.cancel() + + if should_stop: break + + if max_cycles > 0 and cycles >= max_cycles: + break + + # If the run is not time-based (i.e., it's limited by cycles or requests) + # and it's not a continuous directory scan, we should stop after one pass. + # This makes the behavior of --set run_until.requests=N more intuitive: it acts + # as an upper limit for a single pass, not a trigger for multiple passes. + if settings.get('directory_scan_mode') != 'continuous' and not duration_seconds: + logger.info("Run is not time-based. Halting after one full pass through sources.") + break + + logger.info("Cycle complete.") + + except KeyboardInterrupt: + logger.info("\nForceful shutdown requested...") + finally: + # --- Graceful Shutdown URL Reporting --- + if shutdown_event.is_set(): + orchestration_mode = settings.get('orchestration_mode') + if orchestration_mode in ['direct_batch_cli', 'direct_docker_cli'] and mode == 'fetch_only': + urls_file = settings.get('urls_file') + # Check if urls_list was loaded for the relevant mode + if urls_file and 'urls_list' in locals() and urls_list: + last_index = state_manager.get_last_url_index() + # The index points to the *next* URL to be processed. + # If a batch was aborted, it might have been rewound. + # We should save all URLs from this index onwards. + if last_index < len(urls_list): + unprocessed_urls = urls_list[last_index:] + unprocessed_filename = f"unprocessed_urls_{policy_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt" + try: + with open(unprocessed_filename, 'w', encoding='utf-8') as f: + f.write('\n'.join(unprocessed_urls)) + logger.warning(f"--- GRACEFUL SHUTDOWN ---") + logger.warning(f"Saved {len(unprocessed_urls)} unprocessed URLs to '{unprocessed_filename}'.") + logger.warning(f"Last processed URL index was {last_index}. Next run should start from index {last_index + 1}.") + logger.warning(f"-------------------------") + except IOError as e: + logger.error(f"Could not save unprocessed URLs: {e}") + + state_manager.print_summary(policy) + state_manager.close() return 0