Remove unused dags
parent de609aaecd
commit 7b13257bd1

36 .aider.chat.history.md Normal file
@@ -0,0 +1,36 @@
# aider chat started at 2025-08-19 17:52:04

> Newer aider version v0.86.1 is available.
> /Users/aperez/.local/share/uv/tools/aider-chat/bin/python3 -m pip install --upgrade --upgrade-strategy only-if-needed aider-chat
> Run pip install? (Y)es/(N)o [Yes]: y
> Re-run aider to use new version.

# aider chat started at 2025-08-19 17:55:26

> /Users/aperez/.local/bin/aider --model gemini/gemini-2.5-pro-preview-03-25
> Aider v0.86.1
> Main model: gemini/gemini-2.5-pro-preview-03-25 with diff-fenced edit format
> Weak model: gemini/gemini-2.0-flash
> Git repo: .git with 34 files
> Repo-map: using 4096 tokens, auto refresh
> https://aider.chat/HISTORY.html#release-notes
> Please answer with one of: yes, no, skip, all
> Would you like to see what's new in this version? (Y)es/(N)o [Yes]: n

#### ls

>
>
> ^C again to exit
>
>
> ^C again to exit

#### /read-only ../yt-dlp-deployment/ansible/airflow_worker

> Added 51 files from directory /opt/yt-dlp-dags/../yt-dlp-deployment/ansible/airflow_worker to read-only files.
>
>
> ^C again to exit
>
>
> ^C KeyboardInterrupt
15 .aider.input.history Normal file
@@ -0,0 +1,15 @@
# 2025-08-19 17:52:06.094899
+Y

# 2025-08-19 17:55:33.616531
+D

# 2025-08-19 17:55:35.382770
+No

# 2025-08-19 17:55:39.050939
+ls

# 2025-08-19 17:56:18.910148
+/read-only ../yt-dlp-deployment/ansible/airflow_worker
BIN .aider.tags.cache.v4/cache.db Normal file
Binary file not shown.
23 airflow/.env
@@ -1,23 +0,0 @@
AIRFLOW_IMAGE_NAME=apache/airflow:2.10.4
_AIRFLOW_WWW_USER_USERNAME=airflow
_AIRFLOW_WWW_USER_PASSWORD=airflow-password-ytld
AIRFLOW_UID=50000
AIRFLOW_PROJ_DIR=.

AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow-new-super-pass@89.253.221.173:52919/airflow
AIRFLOW__CELERY__RESULT_BACKEND=db+postgresql://airflow:airflow-new-super-pass@89.253.221.173:52919/airflow
AIRFLOW__CELERY__BROKER_URL=redis://:rOhTAIlTFFylXsjhqwxnYxDChFc@89.253.221.173:52909/0

AIRFLOW_QUEUE=holisticlegs-download
AIRFLOW_QUEUE_CHECK=holisticlegs-check
AIRFLOW_QUEUE_UPLOAD=holisticlegs-upload
AIRFLOW__WEBSERVER__SECRET_KEY=8DJ6XbtIICassrVxM9jWV3eTlt5N3XtyEdyW
HOSTNAME=85.192.30.55

AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT=768M
AIRFLOW_WORKER_DOWNLOAD_MEM_RESERV=522M
AIRFLOW_WORKER_DOWNLOAD_CONCURRENCY=2

AIRFLOW_SMALL_WORKERS_MEM_LIMIT=1024M
AIRFLOW_SMALL_WORKERS_MEM_RESERV=512M
~
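For context, a hedged sketch of how a worker-side script might read the queue and memory settings defined in the removed `airflow/.env` above. The variable names come from that file; the helper itself is illustrative only.

```python
import os

# Hypothetical helper: read the queue names and worker limits declared in
# the removed airflow/.env (names taken from the file above).
def load_worker_settings() -> dict:
    return {
        "download_queue": os.environ.get("AIRFLOW_QUEUE", "holisticlegs-download"),
        "check_queue": os.environ.get("AIRFLOW_QUEUE_CHECK", "holisticlegs-check"),
        "upload_queue": os.environ.get("AIRFLOW_QUEUE_UPLOAD", "holisticlegs-upload"),
        "download_mem_limit": os.environ.get("AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT", "768M"),
        "download_concurrency": int(os.environ.get("AIRFLOW_WORKER_DOWNLOAD_CONCURRENCY", "2")),
    }

if __name__ == "__main__":
    print(load_worker_settings())
```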
@@ -1,60 +0,0 @@
# This file contains all environment variables for the Airflow-based deployment.
# Copy this file to .env in the same directory and fill in your production values.
# This file is used by `generate_envoy_config.py` and `docker-compose-ytdlp-ops.yaml`.

# --- Common Configuration ---
# A unique name for this server instance, used as a key in Redis.
# This is hardcoded in the docker-compose file but can be overridden here.
SERVER_IDENTITY=ytdlp-ops-airflow-service

# Redis connection details for proxy and account state management.
REDIS_HOST=redis
REDIS_PORT=6379
REDIS_PASSWORD=redis_pwd_K3fG8hJ1mN5pQ2sT

# --- Airflow Database Configuration ---
# The password for the PostgreSQL database used by Airflow.
# This should be a secure, randomly generated password.
POSTGRES_PASSWORD=pgdb_pwd_A7bC2xY9zE1wV5uP

# The password for the Airflow web UI admin user.
AIRFLOW_ADMIN_PASSWORD=admin_pwd_X9yZ3aB1cE5dF7gH

# --- Envoy & Worker Configuration ---
# The public-facing port for the Envoy load balancer that fronts the WORKERS.
ENVOY_PORT=9080
# The port for Envoy's admin/stats interface.
ENVOY_ADMIN_PORT=9901
# The public-facing port for the standalone MANAGEMENT service.
MANAGEMENT_SERVICE_PORT=9091
# The number of Python server workers to run.
# Set to 1 to simplify debugging. Multi-worker mode is experimental.
YTDLP_WORKERS=1
# The starting port for the Python workers. They will use sequential ports (e.g., 9090, 9091, ...).
YTDLP_BASE_PORT=9090

# --- Camoufox (Browser) Configuration ---
# Comma-separated list of SOCKS5 proxies to be used by Camoufox instances.
# Each proxy will get its own dedicated browser instance.
# Example: CAMOUFOX_PROXIES="socks5://user:pass@p.webshare.io:80,socks5://user:pass@p.webshare.io:81"
CAMOUFOX_PROXIES="socks5://your_proxy_user:your_proxy_pass@proxy.example.com:1080,socks5://your_proxy_user:your_proxy_pass@proxy.example.com:1081"

# Password for VNC access to the Camoufox browser instances.
VNC_PASSWORD=vnc_pwd_Z5xW8cV2bN4mP7lK

# The starting port for VNC access. Ports will be assigned sequentially (e.g., 5901, 5902, ...).
CAMOUFOX_BASE_VNC_PORT=5901

# The internal port used by Camoufox for its WebSocket server. Usually does not need to be changed.
CAMOUFOX_PORT=12345

# --- General Proxy Configuration ---
# A general-purpose SOCKS5 proxy that can be used alongside Camoufox proxies.
# This should be the IP address of the proxy server accessible from within the Docker network.
# '172.17.0.1' is often the host IP from within a container.
SOCKS5_SOCK_SERVER_IP=172.17.0.1

# --- Account Manager Configuration ---
# Account cooldown parameters (values are in minutes).
ACCOUNT_ACTIVE_DURATION_MIN=30
ACCOUNT_COOLDOWN_DURATION_MIN=60
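The comments in this removed example file describe one Camoufox browser instance per proxy, with VNC ports assigned sequentially from CAMOUFOX_BASE_VNC_PORT. A minimal sketch of that pairing, assuming only the variable names from the file (the helper itself is hypothetical):

```python
import os

# Hypothetical sketch: split CAMOUFOX_PROXIES and pair each proxy with a
# sequential VNC port starting at CAMOUFOX_BASE_VNC_PORT, as described in
# the env file comments above.
def plan_camoufox_instances():
    raw = os.environ.get("CAMOUFOX_PROXIES", "").strip('"')
    proxies = [p.strip() for p in raw.split(",") if p.strip()]
    base_vnc = int(os.environ.get("CAMOUFOX_BASE_VNC_PORT", "5901"))
    return [{"proxy": proxy, "vnc_port": base_vnc + i} for i, proxy in enumerate(proxies)]

if __name__ == "__main__":
    for instance in plan_camoufox_instances():
        print(instance)
```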
File diff suppressed because it is too large
@@ -1,736 +0,0 @@
import sys
import os
import time
import csv
import json
import logging
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Optional, Dict, Callable, Union
from threading import Event

from PyQt6.QtCore import Qt, QThread, pyqtSignal, QObject, QTimer
from PyQt6.QtWidgets import (
    QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
    QLabel, QLineEdit, QPushButton, QTextEdit, QSpinBox, QDoubleSpinBox,
    QCheckBox, QGroupBox, QGridLayout, QMessageBox, QProgressBar, QDialog,
    QComboBox, QFileDialog
)

# Define the current version of this tool.
CURRENT_VERSION = "1.3.0"

class ProxyChecker:
    """
    Fetches proxy lists from given URLs and checks if they work.
    Supports cancellation, pause/resume, progress reporting, and collects optional detailed
    response times, anonymity classification, and geo-location details for working proxies.
    """
    def __init__(self,
                 proxy_urls: Dict[str, str],
                 timeout: int = 1,
                 max_retries: int = 3,
                 retry_delay: float = 1.0,
                 max_workers: int = 20,
                 check_url: str = "http://www.google.com",
                 detailed_results: bool = False,
                 export_format: str = "txt",  # or "csv" or "json"
                 user_agent: Optional[str] = None,
                 log_callback: Optional[Callable[[str], None]] = None,
                 progress_callback: Optional[Callable[[int], None]] = None):
        self.proxy_urls = proxy_urls
        self.timeout = timeout
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.max_workers = max_workers
        self.check_url = check_url
        self.detailed_results = detailed_results
        self.export_format = export_format.lower()
        self.user_agent = user_agent
        self.log_callback = log_callback
        self.progress_callback = progress_callback
        self.cancel_event = Event()
        self.pause_event = Event()  # When set, processing is paused

        # Statistics counters
        self.total_proxies_checked = 0
        self.working_proxies_found = 0
        self.overall_total_count = 0
        self.overall_processed_count = 0

        # Store detailed working results by type.
        self.working_results: Dict[str, List[Union[str, Dict[str, Union[str, float, dict]]]]] = {}

        self.session = requests.Session()
        if self.user_agent:
            self.session.headers["User-Agent"] = self.user_agent

        # Determine the client IP to help with anonymity detection.
        try:
            r = requests.get("https://api.ipify.org?format=json", timeout=3)
            r.raise_for_status()
            self.client_ip = r.json().get("ip")
            self.log("info", f"Client IP determined as {self.client_ip}")
        except requests.RequestException:
            self.client_ip = "unknown"
            self.log("warning", "Could not determine client IP for anonymity detection.")

    def log(self, level: str, message: str) -> None:
        full_message = f"{level.upper()}: {message}"
        if self.log_callback:
            self.log_callback(full_message)
        else:
            print(full_message)

    def cancel(self) -> None:
        self.cancel_event.set()
        self.log("info", "Cancellation requested.")

    def pause(self) -> None:
        self.pause_event.set()
        self.log("info", "Proxy checking paused.")

    def resume(self) -> None:
        self.pause_event.clear()
        self.log("info", "Proxy checking resumed.")

    def determine_anonymity(self, proxy: str) -> str:
        try:
            session = requests.Session()
            session.proxies = {'http': proxy, 'https': proxy}
            r = session.get("https://api.ipify.org?format=json", timeout=self.timeout)
            r.raise_for_status()
            proxy_ip = r.json().get("ip")
            return "transparent" if proxy_ip == self.client_ip else "anonymous"
        except requests.RequestException:
            return "unknown"

    def get_geo_info(self, ip: str) -> dict:
        try:
            r = requests.get(f"http://ip-api.com/json/{ip}", timeout=3)
            r.raise_for_status()
            return r.json()
        except requests.RequestException:
            return {}

    def check_proxy(self, proxy: str) -> Optional[Union[str, dict]]:
        if self.cancel_event.is_set():
            return None
        # If paused, wait until resumed.
        while self.pause_event.is_set():
            time.sleep(0.1)
        try:
            start = time.time()
            session = requests.Session()
            session.proxies = {'http': proxy, 'https': proxy}
            if self.user_agent:
                session.headers["User-Agent"] = self.user_agent
            response = session.get(self.check_url, timeout=self.timeout)
            elapsed = time.time() - start
            if response.status_code == 200:
                if self.detailed_results:
                    anonymity = self.determine_anonymity(proxy)
                    ip_only = proxy.split(':')[0]
                    geo = self.get_geo_info(ip_only)
                    return {
                        "proxy": proxy,
                        "response_time": elapsed,
                        "anonymity": anonymity,
                        "geo": geo
                    }
                else:
                    return proxy
        except requests.RequestException:
            return None

    def get_proxies(self, url: str) -> List[str]:
        for attempt in range(self.max_retries):
            if self.cancel_event.is_set():
                self.log("info", "Cancellation detected while fetching proxies.")
                return []
            try:
                response = self.session.get(url, timeout=self.timeout)
                response.raise_for_status()
                self.log("info", f"Successfully fetched proxies from {url}")
                return response.text.strip().splitlines()
            except requests.RequestException as e:
                self.log("warning", f"Attempt {attempt + 1} failed for {url}: {e}")
                time.sleep(self.retry_delay)
        self.log("error", f"Failed to retrieve proxies from {url} after {self.max_retries} attempts.")
        return []

    @staticmethod
    def create_proxy_dir(directory: str) -> None:
        os.makedirs(directory, exist_ok=True)

    def process_proxies(self,
                        proxy_type: str,
                        url: Optional[str] = None,
                        proxies: Optional[List[str]] = None) -> int:
        if proxies is None and url is not None:
            proxies = self.get_proxies(url)
        if self.cancel_event.is_set():
            self.log("info", "Cancellation detected before processing proxies.")
            return 0
        if not proxies:
            self.log("warning", f"No proxies to check for {proxy_type}")
            return 0

        total_proxies = len(proxies)
        self.log("info", f"Checking {total_proxies} {proxy_type} proxies with {self.max_workers} workers.")
        working_proxy_list = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = {executor.submit(self.check_proxy, proxy): proxy for proxy in proxies}
            for future in as_completed(futures):
                while self.pause_event.is_set():
                    time.sleep(0.1)
                if self.cancel_event.is_set():
                    self.log("info", "Cancellation detected during proxy checking loop.")
                    break
                result = future.result()
                self.overall_processed_count += 1
                if self.progress_callback and self.overall_total_count > 0:
                    progress_percent = int((self.overall_processed_count / self.overall_total_count) * 100)
                    self.progress_callback(progress_percent)
                if result:
                    working_proxy_list.append(result)

        self.working_results[proxy_type] = working_proxy_list
        file_ext = ".csv" if self.export_format == "csv" else ".json" if self.export_format == "json" else ".txt"
        proxy_file = f'proxies/{proxy_type}{file_ext}'
        self.create_proxy_dir(os.path.dirname(proxy_file))
        try:
            if self.export_format == "csv":
                with open(proxy_file, 'w', newline='') as f:
                    if self.detailed_results:
                        writer = csv.writer(f)
                        writer.writerow(["Proxy", "Response Time (s)", "Anonymity", "Country", "Region", "City"])
                        for item in working_proxy_list:
                            geo = item.get("geo", {})
                            writer.writerow([
                                item.get("proxy"),
                                f"{item.get('response_time', 0):.2f}",
                                item.get("anonymity"),
                                geo.get("country", ""),
                                geo.get("regionName", ""),
                                geo.get("city", "")
                            ])
                    else:
                        writer = csv.writer(f)
                        writer.writerow(["Proxy"])
                        for item in working_proxy_list:
                            writer.writerow([item])
            elif self.export_format == "json":
                with open(proxy_file, 'w') as f:
                    json.dump(working_proxy_list, f, indent=4)
            else:
                with open(proxy_file, 'w') as f:
                    if self.detailed_results:
                        lines = [
                            f"{item.get('proxy')} - {item.get('response_time'):.2f} s - {item.get('anonymity')} - {item.get('geo', {}).get('country', '')}"
                            for item in working_proxy_list
                        ]
                    else:
                        lines = working_proxy_list
                    f.write('\n'.join(lines) + '\n')
        except OSError as e:
            self.log("error", f"Failed to write working proxies to {proxy_file}: {e}")

        self.log("info", f"Checked {total_proxies} {proxy_type} proxies. Working: {len(working_proxy_list)}.")
        self.total_proxies_checked += total_proxies
        self.working_proxies_found += len(working_proxy_list)
        return len(working_proxy_list)

    def get_statistics(self) -> str:
        stats = f"Total proxies checked: {self.total_proxies_checked}\n"
        stats += f"Working proxies found: {self.working_proxies_found}\n"
        if self.detailed_results:
            all_times = []
            for lst in self.working_results.values():
                all_times.extend([item.get("response_time") for item in lst if isinstance(item, dict)])
            if all_times:
                avg_time = sum(all_times) / len(all_times)
                stats += f"Average response time: {avg_time:.2f} seconds\n"
        return stats

    def run(self) -> None:
        start_time = time.time()
        self.overall_total_count = 0
        self.overall_processed_count = 0
        proxies_by_type: Dict[str, List[str]] = {}

        for proxy_type, url in self.proxy_urls.items():
            if self.cancel_event.is_set():
                self.log("info", "Cancellation detected. Aborting processing.")
                return
            proxies = self.get_proxies(url)
            proxies_by_type[proxy_type] = proxies
            self.overall_total_count += len(proxies)

        if self.overall_total_count == 0:
            self.log("warning", "No proxies fetched from any source.")

        for proxy_type, proxies in proxies_by_type.items():
            if self.cancel_event.is_set():
                self.log("info", "Cancellation detected. Aborting further processing.")
                break
            self.process_proxies(proxy_type, proxies=proxies)

        self.session.close()
        end_time = time.time()
        minutes, seconds = divmod(end_time - start_time, 60)
        self.log("info", f"Total proxies checked: {self.total_proxies_checked}. Working proxies: {self.working_proxies_found}.")
        self.log("info", f"Execution time: {int(minutes)} minutes {int(seconds)} seconds.")
        self.log("info", "Statistics:\n" + self.get_statistics())
        # Append history log
        try:
            with open("history.log", "a") as hist_file:
                hist_file.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {self.get_statistics()}\n")
        except OSError as e:
            self.log("error", f"Failed to write history log: {e}")

class ProxyCheckerWorker(QObject):
    """
    Worker class to run the proxy checking process in a separate thread.
    Emits log messages, progress updates, and a finished signal.
    """
    log_signal = pyqtSignal(str)
    progress_update = pyqtSignal(int)
    finished = pyqtSignal()

    def __init__(self,
                 proxy_urls: Dict[str, str],
                 timeout: int,
                 max_retries: int,
                 retry_delay: float,
                 max_workers: int,
                 check_url: str,
                 detailed_results: bool,
                 export_format: str,
                 user_agent: Optional[str] = None):
        super().__init__()
        self.proxy_urls = proxy_urls
        self.timeout = timeout
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.max_workers = max_workers
        self.check_url = check_url
        self.detailed_results = detailed_results
        self.export_format = export_format
        self.user_agent = user_agent
        self.checker: Optional[ProxyChecker] = None

    def log_callback(self, message: str) -> None:
        self.log_signal.emit(message)

    def progress_callback(self, progress: int) -> None:
        self.progress_update.emit(progress)

    def cancel(self) -> None:
        if self.checker is not None:
            self.checker.cancel()

    def run(self) -> None:
        self.checker = ProxyChecker(
            proxy_urls=self.proxy_urls,
            timeout=self.timeout,
            max_retries=self.max_retries,
            retry_delay=self.retry_delay,
            max_workers=self.max_workers,
            check_url=self.check_url,
            detailed_results=self.detailed_results,
            export_format=self.export_format,
            user_agent=self.user_agent,
            log_callback=self.log_callback,
            progress_callback=self.progress_callback
        )
        self.log_callback("Starting proxy checking...")
        self.checker.run()
        self.log_callback("Proxy checking finished.")
        self.finished.emit()

class UpdateChecker(QObject):
    """
    Worker class to check for software updates.
    """
    update_checked = pyqtSignal(str)

    def run(self) -> None:
        try:
            response = requests.get("https://api.github.com/repos/Jesewe/proxy-checker/releases/latest", timeout=5)
            response.raise_for_status()
            data = response.json()
            latest_version = data["tag_name"].lstrip("v")
            if latest_version != CURRENT_VERSION:
                msg = (f"New version available: {latest_version}.\n"
                       f"You are using version {CURRENT_VERSION}.\n"
                       f"Visit {data['html_url']} to download the update.")
            else:
                msg = f"You are up-to-date with version {CURRENT_VERSION}."
        except Exception as e:
            msg = f"Failed to check for updates: {e}"
        self.update_checked.emit(msg)

class MainWindow(QMainWindow):
    def __init__(self):
        super().__init__()
        self.setWindowTitle("Proxy Checker")
        self.setGeometry(100, 100, 850, 750)
        self.init_ui()
        self.thread: Optional[QThread] = None
        self.worker: Optional[ProxyCheckerWorker] = None
        self.update_thread: Optional[QThread] = None
        self.last_checker: Optional[ProxyChecker] = None
        self.is_paused = False

    def init_ui(self):
        main_widget = QWidget()
        main_layout = QVBoxLayout()

        # Configuration group
        config_group = QGroupBox("Settings")
        config_layout = QGridLayout()

        # Timeout
        config_layout.addWidget(QLabel("Timeout (s):"), 0, 0)
        self.timeout_spin = QSpinBox()
        self.timeout_spin.setRange(1, 60)
        self.timeout_spin.setValue(3)
        config_layout.addWidget(self.timeout_spin, 0, 1)

        # Max Retries
        config_layout.addWidget(QLabel("Max Retries:"), 0, 2)
        self.retries_spin = QSpinBox()
        self.retries_spin.setRange(1, 10)
        self.retries_spin.setValue(3)
        config_layout.addWidget(self.retries_spin, 0, 3)

        # Retry Delay
        config_layout.addWidget(QLabel("Retry Delay (s):"), 1, 0)
        self.retry_delay_spin = QDoubleSpinBox()
        self.retry_delay_spin.setRange(0.1, 10.0)
        self.retry_delay_spin.setSingleStep(0.1)
        self.retry_delay_spin.setValue(1.0)
        config_layout.addWidget(self.retry_delay_spin, 1, 1)

        # Max Workers
        config_layout.addWidget(QLabel("Max Workers:"), 1, 2)
        self.workers_spin = QSpinBox()
        self.workers_spin.setRange(1, 200)
        self.workers_spin.setValue(50)
        config_layout.addWidget(self.workers_spin, 1, 3)

        # Test URL
        config_layout.addWidget(QLabel("Test URL:"), 2, 0)
        self.test_url_edit = QLineEdit("http://www.google.com")
        config_layout.addWidget(self.test_url_edit, 2, 1, 1, 3)

        # Custom User-Agent
        config_layout.addWidget(QLabel("Custom User-Agent:"), 3, 0)
        self.user_agent_edit = QLineEdit("")
        self.user_agent_edit.setPlaceholderText("Leave blank for default")
        config_layout.addWidget(self.user_agent_edit, 3, 1, 1, 3)

        # Detailed Results Option
        self.detailed_checkbox = QCheckBox("Detailed Results (Include Response Time, Anonymity & Geo)")
        config_layout.addWidget(self.detailed_checkbox, 4, 0, 1, 2)

        # Export Format Option
        config_layout.addWidget(QLabel("Export Format:"), 4, 2)
        self.export_format_combo = QComboBox()
        self.export_format_combo.addItems(["txt", "csv", "json"])
        config_layout.addWidget(self.export_format_combo, 4, 3)

        config_group.setLayout(config_layout)
        main_layout.addWidget(config_group)

        # Proxy Sources Group
        proxy_group = QGroupBox("Proxy Sources")
        proxy_layout = QGridLayout()
        self.proxy_urls = {
            "http": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt",
            "socks4": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt",
            "socks5": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt"
        }
        self.proxy_type_checkboxes = {}
        self.proxy_url_edits = {}
        row = 0
        for proxy_type, url in self.proxy_urls.items():
            checkbox = QCheckBox(proxy_type)
            checkbox.setChecked(True)
            self.proxy_type_checkboxes[proxy_type] = checkbox
            proxy_layout.addWidget(checkbox, row, 0)
            url_edit = QLineEdit(url)
            self.proxy_url_edits[proxy_type] = url_edit
            proxy_layout.addWidget(url_edit, row, 1)
            row += 1
        proxy_group.setLayout(proxy_layout)
        main_layout.addWidget(proxy_group)

        # Progress Bar
        self.progress_bar = QProgressBar()
        self.progress_bar.setRange(0, 100)
        self.progress_bar.setValue(0)
        main_layout.addWidget(self.progress_bar)

        # Main Buttons
        btn_layout = QHBoxLayout()
        self.start_btn = QPushButton("Start Checking")
        self.start_btn.clicked.connect(self.start_checking)
        btn_layout.addWidget(self.start_btn)

        self.pause_btn = QPushButton("Pause")
        self.pause_btn.setEnabled(False)
        self.pause_btn.clicked.connect(self.toggle_pause)
        btn_layout.addWidget(self.pause_btn)

        self.cancel_btn = QPushButton("Cancel")
        self.cancel_btn.setEnabled(False)
        self.cancel_btn.clicked.connect(self.cancel_checking)
        btn_layout.addWidget(self.cancel_btn)

        self.show_results_btn = QPushButton("Show Results")
        self.show_results_btn.setEnabled(False)
        self.show_results_btn.clicked.connect(self.show_results)
        btn_layout.addWidget(self.show_results_btn)
        main_layout.addLayout(btn_layout)

        # Extra Buttons: Show Statistics, Save Log
        extra_btn_layout = QHBoxLayout()
        self.show_stats_btn = QPushButton("Show Statistics")
        self.show_stats_btn.setEnabled(False)
        self.show_stats_btn.clicked.connect(self.show_statistics)
        extra_btn_layout.addWidget(self.show_stats_btn)

        self.save_log_btn = QPushButton("Save Log")
        self.save_log_btn.clicked.connect(self.save_log)
        extra_btn_layout.addWidget(self.save_log_btn)
        main_layout.addLayout(extra_btn_layout)

        # Log Text Area
        self.log_text = QTextEdit()
        self.log_text.setReadOnly(True)
        self.log_text.setStyleSheet("background-color: #1e1e1e; color: #d4d4d4; font-family: Consolas; font-size: 12pt;")
        main_layout.addWidget(self.log_text)

        main_widget.setLayout(main_layout)
        self.setCentralWidget(main_widget)

    def start_checking(self):
        self.start_btn.setEnabled(False)
        self.cancel_btn.setEnabled(True)
        self.pause_btn.setEnabled(True)
        self.show_results_btn.setEnabled(False)
        self.show_stats_btn.setEnabled(False)
        self.progress_bar.setValue(0)
        self.log_text.clear()

        # Build proxy_urls from selected checkboxes.
        selected_proxy_urls = {}
        for proxy_type, checkbox in self.proxy_type_checkboxes.items():
            if checkbox.isChecked():
                url = self.proxy_url_edits[proxy_type].text().strip()
                if url:
                    selected_proxy_urls[proxy_type] = url

        if not selected_proxy_urls:
            QMessageBox.warning(self, "No Proxies Selected", "Please select at least one proxy type to check.")
            self.start_btn.setEnabled(True)
            self.cancel_btn.setEnabled(False)
            self.pause_btn.setEnabled(False)
            return

        # Get settings from UI.
        timeout = self.timeout_spin.value()
        max_retries = self.retries_spin.value()
        retry_delay = self.retry_delay_spin.value()
        max_workers = self.workers_spin.value()
        check_url = self.test_url_edit.text().strip()
        detailed_results = self.detailed_checkbox.isChecked()
        export_format = self.export_format_combo.currentText().strip()
        user_agent = self.user_agent_edit.text().strip() or None

        self.thread = QThread()
        self.worker = ProxyCheckerWorker(
            proxy_urls=selected_proxy_urls,
            timeout=timeout,
            max_retries=max_retries,
            retry_delay=retry_delay,
            max_workers=max_workers,
            check_url=check_url,
            detailed_results=detailed_results,
            export_format=export_format,
            user_agent=user_agent
        )
        self.worker.moveToThread(self.thread)
        self.worker.log_signal.connect(self.append_log)
        self.worker.progress_update.connect(self.progress_bar.setValue)
        self.worker.finished.connect(self.on_finished)
        self.thread.started.connect(self.worker.run)
        self.thread.finished.connect(self.thread.deleteLater)
        self.thread.start()

    def toggle_pause(self):
        if self.worker and self.worker.checker:
            if not self.is_paused:
                self.worker.checker.pause()
                self.is_paused = True
                self.pause_btn.setText("Resume")
                self.append_log("Paused proxy checking.")
            else:
                self.worker.checker.resume()
                self.is_paused = False
                self.pause_btn.setText("Pause")
                self.append_log("Resumed proxy checking.")

    def cancel_checking(self):
        if self.worker is not None:
            self.append_log("Cancel requested by user...")
            self.worker.cancel()
            self.cancel_btn.setEnabled(False)

    def append_log(self, message: str):
        timestamp = time.strftime("%H:%M:%S")
        self.log_text.append(f"[{timestamp}] {message}")

    def on_finished(self):
        self.append_log("All tasks completed.")
        self.start_btn.setEnabled(True)
        self.cancel_btn.setEnabled(False)
        self.pause_btn.setEnabled(False)
        self.show_results_btn.setEnabled(True)
        self.show_stats_btn.setEnabled(True)
        if self.thread is not None:
            self.thread.quit()
            self.thread.wait()
        # Save a reference to the last checker for filtering results.
        if self.worker:
            self.last_checker = self.worker.checker

    def show_results(self):
        # If detailed results are enabled, allow filtering by response time.
        if self.last_checker and self.last_checker.detailed_results:
            dialog = QDialog(self)
            dialog.setWindowTitle("Filtered Working Proxies")
            dialog.resize(600, 500)
            layout = QVBoxLayout()

            filter_layout = QHBoxLayout()
            filter_layout.addWidget(QLabel("Max Response Time (s):"))
            filter_spin = QDoubleSpinBox()
            filter_spin.setRange(0.1, 10.0)
            filter_spin.setSingleStep(0.1)
            filter_spin.setValue(1.0)
            filter_layout.addWidget(filter_spin)
            apply_btn = QPushButton("Apply Filter")
            filter_layout.addWidget(apply_btn)
            layout.addLayout(filter_layout)

            result_area = QTextEdit()
            result_area.setReadOnly(True)
            layout.addWidget(result_area)

            def apply_filter():
                threshold = filter_spin.value()
                text = ""
                for ptype, results in self.last_checker.working_results.items():
                    filtered = []
                    for item in results:
                        if isinstance(item, dict) and item.get("response_time") <= threshold:
                            geo = item.get("geo", {})
                            filtered.append(f"{item.get('proxy')} - {item.get('response_time'):.2f} s - {item.get('anonymity')} - {geo.get('country', '')}")
                    if filtered:
                        text += f"--- {ptype} ---\n" + "\n".join(filtered) + "\n\n"
                result_area.setText(text if text else "No proxies match the filter criteria.")

            apply_btn.clicked.connect(apply_filter)
            # Show all results initially
            apply_filter()

            btn_layout = QHBoxLayout()
            copy_btn = QPushButton("Copy to Clipboard")
            copy_btn.clicked.connect(lambda: QApplication.clipboard().setText(result_area.toPlainText()))
            btn_layout.addWidget(copy_btn)
            close_btn = QPushButton("Close")
            close_btn.clicked.connect(dialog.close)
            btn_layout.addWidget(close_btn)
            layout.addLayout(btn_layout)

            dialog.setLayout(layout)
            dialog.exec()
        else:
            # Fallback: read the exported files from the proxies directory.
            results_text = ""
            proxy_dir = "proxies"
            if os.path.isdir(proxy_dir):
                for filename in os.listdir(proxy_dir):
                    filepath = os.path.join(proxy_dir, filename)
                    results_text += f"--- {filename} ---\n"
                    try:
                        with open(filepath, 'r') as f:
                            results_text += f.read() + "\n"
                    except OSError as e:
                        results_text += f"Error reading file: {e}\n"
            else:
                results_text = "No results found."

            dialog = QDialog(self)
            dialog.setWindowTitle("Working Proxies")
            dialog.resize(600, 400)
            dlg_layout = QVBoxLayout()
            text_area = QTextEdit()
            text_area.setReadOnly(True)
            text_area.setText(results_text)
            dlg_layout.addWidget(text_area)

            btn_layout = QHBoxLayout()
            copy_btn = QPushButton("Copy to Clipboard")
            copy_btn.clicked.connect(lambda: QApplication.clipboard().setText(results_text))
            btn_layout.addWidget(copy_btn)
            close_btn = QPushButton("Close")
            close_btn.clicked.connect(dialog.close)
            btn_layout.addWidget(close_btn)
            dlg_layout.addLayout(btn_layout)
            dialog.setLayout(dlg_layout)
            dialog.exec()

    def show_statistics(self):
        if self.worker and self.worker.checker:
            stats = self.worker.checker.get_statistics()
        else:
            stats = "No statistics available."
        QMessageBox.information(self, "Statistics", stats)

    def save_log(self):
        filename, _ = QFileDialog.getSaveFileName(self, "Save Log", "", "Text Files (*.txt);;All Files (*)")
        if filename:
            try:
                with open(filename, 'w') as f:
                    f.write(self.log_text.toPlainText())
                QMessageBox.information(self, "Saved", f"Log saved to {filename}")
            except OSError as e:
                QMessageBox.warning(self, "Error", f"Failed to save log: {e}")

    def auto_check_for_update(self):
        self.update_thread = QThread()
        self.update_worker = UpdateChecker()
        self.update_worker.moveToThread(self.update_thread)
        self.update_worker.update_checked.connect(self.show_update_message)
        self.update_thread.started.connect(self.update_worker.run)
        self.update_thread.start()

    def show_update_message(self, msg: str):
        QMessageBox.information(self, "Update Check", msg)
        self.update_thread.quit()
        self.update_thread.wait()

    def showEvent(self, event):
        super().showEvent(event)
        QTimer.singleShot(1000, self.auto_check_for_update)

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    app = QApplication(sys.argv)
    window = MainWindow()
    window.show()
    sys.exit(app.exec())
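The ProxyChecker class in this removed file is usable without the Qt GUI. A minimal headless sketch, assuming the file were importable as a module named `proxy_checker` (hypothetical name; everything else follows the class definition above):

```python
# Minimal headless sketch of the ProxyChecker defined above (assumed usage).
from proxy_checker import ProxyChecker  # hypothetical module name for the removed file

checker = ProxyChecker(
    proxy_urls={"http": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt"},
    timeout=3,
    max_workers=50,
    detailed_results=False,
    export_format="txt",
    log_callback=print,                        # route log lines to stdout
    progress_callback=lambda p: print(f"{p}%"),
)
checker.run()  # writes working proxies to proxies/http.txt and appends history.log
print(checker.get_statistics())
```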
@@ -1,941 +0,0 @@
from airflow import DAG
from airflow.models import BaseOperator, Variable
from airflow.utils.decorators import apply_defaults
from airflow.hooks.base import BaseHook
from airflow.exceptions import AirflowException
from airflow.utils.dates import days_ago
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from thrift.transport.TTransport import TTransportException
from datetime import datetime, timedelta
from pangramia.yt.exceptions.ttypes import PBServiceException
import redis
import logging
import time
import socket
import json
import os
from pangramia.yt.tokens_ops import YTTokenOpService
from pangramia.yt.common.ttypes import TokenUpdateMode
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.operators.python import PythonOperator
from airflow.models.param import Param
# Assuming ytdlp_utils exists in the same directory or PYTHONPATH
# from ytdlp_utils import get_info_json, is_valid_json, extract_video_id

# Configure logging
logger = logging.getLogger(__name__)

# Default settings (similar to ytdlp_client_dag.py)
MAX_RETRIES = 1
RETRY_DELAY = timedelta(seconds=10)
DEFAULT_TIMEOUT = 30

class YtdlpOpsOperator(BaseOperator):
    """
    Custom Airflow operator to interact with YTDLP Thrift service. Handles direct connections
    and Redis-based discovery, retrieves tokens, saves info.json, and manages errors.
    """
    template_fields = ('url', 'service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir')

    @apply_defaults
    def __init__(self, url, redis_conn_id='redis_default', max_retries=3, retry_delay=10,
                 service_ip=None, service_port=None, redis_enabled=False, account_id=None,
                 save_info_json=True, info_json_dir=None, get_socks_proxy=True,
                 store_socks_proxy=False, timeout=DEFAULT_TIMEOUT, *args, **kwargs):
        super().__init__(*args, **kwargs)

        logger.info(f"Initializing YtdlpOpsOperator with parameters: url={url}, "
                    f"redis_conn_id={redis_conn_id}, max_retries={max_retries}, retry_delay={retry_delay}, "
                    f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, "
                    f"account_id={account_id}, save_info_json={save_info_json}, info_json_dir={info_json_dir}, "
                    f"get_socks_proxy={get_socks_proxy}, store_socks_proxy={store_socks_proxy}, timeout={timeout}")

        # Validate required parameters
        if not url:
            raise ValueError("url is required")

        # Validate parameters based on connection mode
        if redis_enabled:
            if not account_id:
                raise ValueError("account_id is required when redis_enabled=True")
            # Use default Redis connection if not specified
            if not redis_conn_id:
                redis_conn_id = 'redis_default'
                logger.info(f"Using default Redis connection ID: {redis_conn_id}")
        else:
            if not service_ip or not service_port:
                raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False")
            if not account_id:
                logger.warning("No account_id provided for direct connection mode. Using 'default'")
                account_id = 'default'  # Assign default if missing in direct mode

        self.url = url
        self.redis_conn_id = redis_conn_id
        self.max_retries = max_retries
        self.retry_delay = int(retry_delay.total_seconds() if isinstance(retry_delay, timedelta) else retry_delay)
        self.service_ip = service_ip
        self.service_port = service_port
        self.redis_enabled = redis_enabled
        self.account_id = account_id
        self.save_info_json = save_info_json
        self.info_json_dir = info_json_dir
        self.get_socks_proxy = get_socks_proxy
        self.store_socks_proxy = store_socks_proxy
        self.timeout = timeout

    def execute(self, context):
        logger.info("Executing YtdlpOpsOperator")
        transport = None
        try:
            logger.info("Getting task parameters")
            params = context.get('params', {})
            redis_enabled = params.get('redis_enabled', self.redis_enabled)
            logger.info(f"Using redis_enabled={redis_enabled} (from {'task params' if 'redis_enabled' in params else 'operator init'})")

            # Determine account_id to use (from params or operator default)
            account_id = context['params'].get('account_id', self.account_id)
            logger.info(f"Using account_id='{account_id}' (from {'task params' if 'account_id' in params else 'operator init'})")

            if redis_enabled:
                # Get Redis connection with proper authentication and error handling
                redis_conn = BaseHook.get_connection(self.redis_conn_id)
                redis_client = redis.Redis(
                    host=redis_conn.host,
                    port=redis_conn.port,
                    password=redis_conn.password,
                    db=0,
                    decode_responses=True  # Important for consistent key handling
                )

                # Test Redis connection
                try:
                    if not redis_client.ping():
                        raise redis.exceptions.ConnectionError("Redis ping failed")
                    logger.info(f"Successfully connected to Redis at {redis_conn.host}:{redis_conn.port}")
                except redis.exceptions.AuthenticationError:
                    logger.error(f"Redis authentication failed for connection '{self.redis_conn_id}'. Check password.")
                    raise AirflowException("Redis authentication failed.")
                except redis.exceptions.ConnectionError as e:
                    logger.error(f"Could not connect to Redis at {redis_conn.host}:{redis_conn.port}. Error: {e}")
                    raise AirflowException(f"Redis connection failed: {e}")
                except Exception as e:
                    logger.error(f"Unexpected Redis error: {str(e)}")
                    raise AirflowException(f"Unexpected Redis error: {e}")

                # Get service details from Redis with retries and proper key handling
                service_key = f"ytdlp:{account_id}"
                legacy_key = account_id  # For backward compatibility

                host = None
                port = None
                for attempt in range(self.max_retries):
                    try:
                        logger.info(f"Attempt {attempt + 1}/{self.max_retries}: Fetching service details from Redis for keys: '{service_key}', '{legacy_key}'")
                        service_details = redis_client.hgetall(service_key)
                        if not service_details:
                            logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'")
                            service_details = redis_client.hgetall(legacy_key)

                        if not service_details:
                            raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}")

                        # Find IP and port, handling potential case differences and byte/string types
                        ip_key = next((k for k in service_details if k.lower() == 'ip'), None)
                        port_key = next((k for k in service_details if k.lower() == 'port'), None)

                        if not ip_key: raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}")
                        if not port_key: raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}")

                        host = service_details[ip_key]  # Already decoded due to decode_responses=True
                        port_str = service_details[port_key]

                        try:
                            port = int(port_str)
                        except ValueError:
                            raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}")

                        logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}")
                        break  # Success

                    except Exception as e:
                        logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {str(e)}")
                        if attempt == self.max_retries - 1:
                            logger.error("Max retries reached for fetching Redis details.")
                            raise AirflowException(f"Failed to get service details from Redis after {self.max_retries} attempts: {e}")
                        logger.info(f"Retrying in {self.retry_delay} seconds...")
                        time.sleep(self.retry_delay)
            else:
                # Direct connection: Log parameter sources
                params = context.get('params', {})
                host = params.get('service_ip', self.service_ip)
                host_source = 'task params' if 'service_ip' in params else 'operator init'
                port_str = params.get('service_port', self.service_port)
                port_source = 'task params' if 'service_port' in params else 'operator init'
                url = params.get('url', self.url)
                url_source = 'task params' if 'url' in params else 'operator init'

                logger.info(f"Using service_ip={host} (from {host_source})")
                logger.info(f"Using service_port={port_str} (from {port_source})")
                logger.info(f"Using url={url} (from {url_source})")

                if not host or not port_str:
                    raise ValueError("Direct connection requires service_ip and service_port")
                try:
                    port = int(port_str)
                except ValueError:
                    raise ValueError(f"Invalid service_port value: {port_str}")

                logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)")

            # Render and validate timeout
            timeout_param = context.get('params', {}).get('timeout', self.timeout)
            if isinstance(self.timeout, str) and '{{' in self.timeout:
                timeout_rendered = self.render_template(self.timeout, context)
                logger.info(f"Rendered timeout template: '{self.timeout}' -> '{timeout_rendered}'")
                timeout_param = timeout_rendered
            try:
                timeout = int(timeout_param)
                if timeout <= 0: raise ValueError("Timeout must be positive")
                logger.info(f"Using timeout: {timeout} seconds")
            except (ValueError, TypeError):
                logger.warning(f"Invalid timeout value: '{timeout_param}'. Using default: {DEFAULT_TIMEOUT}")
                timeout = DEFAULT_TIMEOUT

            # Create Thrift connection objects
            socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET)  # Explicitly use AF_INET (IPv4)
            socket_conn.setTimeout(timeout * 1000)  # Thrift timeout is in milliseconds
            transport = TTransport.TFramedTransport(socket_conn)
            protocol = TBinaryProtocol.TBinaryProtocol(transport)
            client = YTTokenOpService.Client(protocol)

            logger.info(f"Attempting to connect to Thrift server at {host}:{port}...")
            try:
                transport.open()
                logger.info("Successfully connected to Thrift server.")

                # Test connection with ping
                try:
                    client.ping()
                    logger.info("Server ping successful.")
                except Exception as e:
                    logger.error(f"Server ping failed: {e}")
                    raise AirflowException(f"Server connection test (ping) failed: {e}")

                # Get token from service with specific error handling
                try:
                    url_param = context.get('params', {}).get('url', self.url)
                    logger.info(f"Requesting token for accountId='{account_id}', url='{url_param}'")
                    token_data = client.getOrRefreshToken(
                        accountId=account_id,
                        updateType=TokenUpdateMode.AUTO,
                        url=url_param
                    )
                    logger.info("Successfully retrieved token data from service.")
                except PBServiceException as e:
                    logger.error(f"PBServiceException occurred: Code={getattr(e, 'errorCode', 'N/A')}, Message={getattr(e, 'message', 'N/A')}")
                    error_code = getattr(e, 'errorCode', None)
                    error_msg = f"YTDLP service error: {getattr(e, 'message', str(e))}"
                    # Handle specific known error codes
                    if error_code in [
                        "SOCKS5_CONNECTION_FAILED", "SOCKS5_TIMEOUT",
                        "SOCKS5_CONNECTION_REFUSED", "SOCKS5_CONNECTION_TIMEOUT",
                        "SOCKS5_HOST_NOT_FOUND", "SOCKS5_NETWORK_UNREACHABLE"
                    ]:
                        error_msg = f"SOCKS5 proxy error ({error_code}): {e.message}. Check proxy settings."
                    elif error_code == "BOT_DETECTION":
                        error_msg = f"Bot detection triggered ({error_code}): {e.message}."
                        suggestions = getattr(e, 'context', {}).get('suggestions', [])
                        if suggestions: error_msg += "\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions)
                    elif error_code == "NODEJS_SCRIPT_ERROR":
                        error_msg = f"Node.js script error ({error_code}): {e.message}."
                    elif error_code == "NODEJS_TIMEOUT":
                        error_msg = f"Node.js timeout ({error_code}): {e.message}."
                    # Add more specific error handling as needed
                    raise AirflowException(error_msg)
                except TTransportException as e:
                    logger.error(f"Thrift transport error during getOrRefreshToken: {e}")
                    raise AirflowException(f"Transport error during API call: {e}")
                except Exception as e:
                    logger.error(f"Unexpected error during getOrRefreshToken: {e}")
                    raise AirflowException(f"Unexpected error during API call: {e}")

            except TTransportException as e:
                # Handle connection-specific transport errors
                if "read 0 bytes" in str(e) or "Could not connect to" in str(e) or "Connection refused" in str(e):
                    logger.error(f"Connection failed to {host}:{port}. Details: {e}")
                    logger.error("Possible causes: Server down, firewall block, incorrect IP/port.")
                    raise AirflowException(f"Failed to connect to YTDLP service at {host}:{port}: {e}")
                else:
                    logger.error(f"Thrift transport error during connection: {str(e)}")
                    raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}")
            except Exception as e:
                logger.error(f"Unexpected error during connection or ping: {str(e)}")
                raise  # Re-raise other unexpected errors

            # Log received token data attributes for debugging
            logger.debug(f"Token data received. Attributes: {dir(token_data)}")
            for attr in dir(token_data):
                if not attr.startswith('__') and not callable(getattr(token_data, attr)):  # Log non-callable attributes
                    value = getattr(token_data, attr)
                    if attr == 'infoJson' and value:
                        logger.debug(f"infoJson: {value[:50]}...")
                    else:
                        logger.debug(f"{attr}: {value}")

            info_json_path = None  # Initialize info_json_path

            save_info_json_param = context['params'].get('save_info_json', self.save_info_json)
            # Render if it's a string template
            if isinstance(save_info_json_param, str):
                save_info_json_rendered = self.render_template(save_info_json_param, context)
                # Convert common string representations to boolean
                save_info_json = str(save_info_json_rendered).lower() in ['true', '1', 't', 'y', 'yes']
            else:
                save_info_json = bool(save_info_json_param)

            # Save info.json if requested and valid
            if self.save_info_json:
                info_json = self._get_info_json(token_data)
                if info_json and self._is_valid_json(info_json):
                    try:
                        # Use internal _save_info_json method which handles rendering, dir creation, logging
                        info_json_path = self._save_info_json(context, info_json)
                        if info_json_path:  # Check if saving was successful
                            context['task_instance'].xcom_push(key='info_json_path', value=info_json_path)
                            logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}")
                        else:
                            # _save_info_json should log errors, push None to indicate failure
                            context['task_instance'].xcom_push(key='info_json_path', value=None)
                            logger.warning("info.json saving failed (check logs from _save_info_json), pushing None to XCom for info_json_path.")
                    except Exception as e:
                        logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True)
                        context['task_instance'].xcom_push(key='info_json_path', value=None)  # Push None on error
                elif info_json:
                    logger.warning("Retrieved infoJson is not valid JSON. Skipping save.")
                    context['task_instance'].xcom_push(key='info_json_path', value=None)
                else:
                    logger.info("No infoJson found in token data. Skipping save.")
                    context['task_instance'].xcom_push(key='info_json_path', value=None)
            else:
                logger.info("save_info_json is False. Skipping info.json save.")
                context['task_instance'].xcom_push(key='info_json_path', value=None)

            # Extract and potentially store SOCKS proxy
            socks_proxy = None
            if self.get_socks_proxy:  # Use instance attribute
                # Check for common attribute names for proxy
                proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
                if proxy_attr:
                    socks_proxy = getattr(token_data, proxy_attr)
                    if socks_proxy:  # Ensure proxy value is not empty
                        logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
                        if self.store_socks_proxy:  # Use instance attribute
                            context['task_instance'].xcom_push(key='socks_proxy', value=socks_proxy)
                            logger.info(f"Pushed key 'socks_proxy' to XCom with value: {socks_proxy}")
                        else:
                            logger.info("SOCKS proxy extracted but not pushed to XCom (store_socks_proxy=False).")
                    else:
                        logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty. No proxy extracted.")
                        # Push None even if found but empty, if storing is enabled
                        if self.store_socks_proxy:  # Use instance attribute
                            context['task_instance'].xcom_push(key='socks_proxy', value=None)
                            logger.info("Pushed None to XCom for 'socks_proxy' as extracted value was empty.")
                else:
                    logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found in token data.")
                    # Push None if storing is enabled but attribute not found
                    if self.store_socks_proxy:  # Use instance attribute
                        context['task_instance'].xcom_push(key='socks_proxy', value=None)
                        logger.info("Pushed None to XCom for 'socks_proxy' as attribute was not found.")
            else:
                logger.info("get_socks_proxy is False. Skipping proxy extraction.")
                # Push None if storing is enabled but extraction was skipped
                if self.store_socks_proxy:  # Use instance attribute
                    context['task_instance'].xcom_push(key='socks_proxy', value=None)
                    logger.info("Pushed None to XCom for 'socks_proxy' as get_socks_proxy=False.")

            # Get the original command from the server
            ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
            if not ytdlp_cmd:
                logger.error("No 'ytdlpCommand' attribute found in token data.")
                raise AirflowException("Required 'ytdlpCommand' not received from service.")

            logger.info(f"Original command received from server: {ytdlp_cmd}")

            # Log example usage command (DO NOT MODIFY the original command here)
            if info_json_path:
                # Use double quotes for paths/proxy in example for robustness
                example_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
                if socks_proxy:
                    example_cmd += f" --proxy \"{socks_proxy}\""
                example_cmd += " --verbose --simulate"  # Add useful flags for testing
                logger.info(f"\n--- Example usage with saved info.json ---")
                logger.info(example_cmd)
                logger.info(f"(Note: The actual command with tokens/cookies is pushed to XCom as 'ytdlp_command')")
                latest_json_path = os.path.join(os.path.dirname(info_json_path), 'latest.json')
                logger.info(f"(You can also use 'latest.json': {latest_json_path})")
                logger.info(f"-------------------------------------------\n")

            else:
                logger.info("\n--- Original command pushed to XCom ('ytdlp_command') ---")
                if socks_proxy:
                    logger.info(f"Use the extracted proxy '{socks_proxy}' (pushed to XCom if store_socks_proxy=True) with the --proxy flag.")
                logger.info("Add --verbose and --simulate flags for testing the command.")
                logger.info(f"-------------------------------------------------------\n")

            # Push the *original* command to XCom
            context['task_instance'].xcom_push(key='ytdlp_command', value=ytdlp_cmd)
            logger.info(f"Pushed original command to XCom key 'ytdlp_command'.")

            # Note: Returning ytdlp_cmd below implicitly pushes the same value
            # to XCom under the key 'return_value'. Downstream tasks should
            # preferably use the explicitly pushed 'ytdlp_command' key for clarity.
|
||||||
|
|
||||||
if not service_details:
|
|
||||||
raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}")
|
|
||||||
|
|
||||||
# Find IP and port, handling potential case differences and byte/string types
|
|
||||||
ip_key = next((k for k in service_details if k.lower() == 'ip'), None)
|
|
||||||
port_key = next((k for k in service_details if k.lower() == 'port'), None)
|
|
||||||
|
|
||||||
if not ip_key: raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}")
|
|
||||||
if not port_key: raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}")
|
|
||||||
|
|
||||||
host = service_details[ip_key] # Already decoded due to decode_responses=True
|
|
||||||
port_str = service_details[port_key]
|
|
||||||
|
|
||||||
try:
|
|
||||||
port = int(port_str)
|
|
||||||
except ValueError:
|
|
||||||
raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}")
|
|
||||||
|
|
||||||
logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}")
|
|
||||||
break # Success
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {str(e)}")
|
|
||||||
if attempt == self.max_retries - 1:
|
|
||||||
logger.error("Max retries reached for fetching Redis details.")
|
|
||||||
raise AirflowException(f"Failed to get service details from Redis after {self.max_retries} attempts: {e}")
|
|
||||||
logger.info(f"Retrying in {self.retry_delay} seconds...")
|
|
||||||
time.sleep(self.retry_delay)
|
|
||||||
else:
|
|
||||||
# Direct connection: Log parameter sources
|
|
||||||
params = context.get('params', {})
|
|
||||||
host = params.get('service_ip', self.service_ip)
|
|
||||||
host_source = 'task params' if 'service_ip' in params else 'operator init'
|
|
||||||
port_str = params.get('service_port', self.service_port)
|
|
||||||
port_source = 'task params' if 'service_port' in params else 'operator init'
|
|
||||||
url = params.get('url', self.url)
|
|
||||||
url_source = 'task params' if 'url' in params else 'operator init'
|
|
||||||
|
|
||||||
logger.info(f"Using service_ip={host} (from {host_source})")
|
|
||||||
logger.info(f"Using service_port={port_str} (from {port_source})")
|
|
||||||
logger.info(f"Using url={url} (from {url_source})")
|
|
||||||
|
|
||||||
if not host or not port_str:
|
|
||||||
raise ValueError("Direct connection requires service_ip and service_port")
|
|
||||||
try:
|
|
||||||
port = int(port_str)
|
|
||||||
except ValueError:
|
|
||||||
raise ValueError(f"Invalid service_port value: {port_str}")
|
|
||||||
|
|
||||||
logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)")
|
|
||||||
|
|
||||||
# Render and validate timeout
|
|
||||||
timeout_param = context.get('params', {}).get('timeout', self.timeout)
|
|
||||||
if isinstance(self.timeout, str) and '{{' in self.timeout:
|
|
||||||
timeout_rendered = self.render_template(self.timeout, context)
|
|
||||||
logger.info(f"Rendered timeout template: '{self.timeout}' -> '{timeout_rendered}'")
|
|
||||||
timeout_param = timeout_rendered
|
|
||||||
try:
|
|
||||||
timeout = int(timeout_param)
|
|
||||||
if timeout <= 0: raise ValueError("Timeout must be positive")
|
|
||||||
logger.info(f"Using timeout: {timeout} seconds")
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
logger.warning(f"Invalid timeout value: '{timeout_param}'. Using default: {DEFAULT_TIMEOUT}")
|
|
||||||
timeout = DEFAULT_TIMEOUT
|
|
||||||
|
|
||||||
# Create Thrift connection objects
|
|
||||||
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET) # Explicitly use AF_INET (IPv4)
|
|
||||||
socket_conn.setTimeout(timeout * 1000) # Thrift timeout is in milliseconds
|
|
||||||
transport = TTransport.TFramedTransport(socket_conn)
|
|
||||||
protocol = TBinaryProtocol.TBinaryProtocol(transport)
|
|
||||||
client = YTTokenOpService.Client(protocol)
|
|
||||||
|
|
||||||
logger.info(f"Attempting to connect to Thrift server at {host}:{port}...")
|
|
||||||
try:
|
|
||||||
transport.open()
|
|
||||||
logger.info("Successfully connected to Thrift server.")
|
|
||||||
|
|
||||||
# Test connection with ping
|
|
||||||
try:
|
|
||||||
client.ping()
|
|
||||||
logger.info("Server ping successful.")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Server ping failed: {e}")
|
|
||||||
raise AirflowException(f"Server connection test (ping) failed: {e}")
|
|
||||||
|
|
||||||
# Get token from service with specific error handling
|
|
||||||
try:
|
|
||||||
url_param = context.get('params', {}).get('url', self.url)
|
|
||||||
logger.info(f"Requesting token for accountId='{account_id}', url='{url_param}'")
|
|
||||||
token_data = client.getOrRefreshToken(
|
|
||||||
accountId=account_id,
|
|
||||||
updateType=TokenUpdateMode.AUTO,
|
|
||||||
url=url_param
|
|
||||||
)
|
|
||||||
logger.info("Successfully retrieved token data from service.")
|
|
||||||
except PBServiceException as e:
|
|
||||||
logger.error(f"PBServiceException occurred: Code={getattr(e, 'errorCode', 'N/A')}, Message={getattr(e, 'message', 'N/A')}")
|
|
||||||
error_code = getattr(e, 'errorCode', None)
|
|
||||||
error_msg = f"YTDLP service error: {getattr(e, 'message', str(e))}"
|
|
||||||
# Handle specific known error codes
|
|
||||||
if error_code in [
|
|
||||||
"SOCKS5_CONNECTION_FAILED", "SOCKS5_TIMEOUT",
|
|
||||||
"SOCKS5_CONNECTION_REFUSED", "SOCKS5_CONNECTION_TIMEOUT",
|
|
||||||
"SOCKS5_HOST_NOT_FOUND", "SOCKS5_NETWORK_UNREACHABLE"
|
|
||||||
]:
|
|
||||||
error_msg = f"SOCKS5 proxy error ({error_code}): {e.message}. Check proxy settings."
|
|
||||||
elif error_code == "BOT_DETECTION":
|
|
||||||
error_msg = f"Bot detection triggered ({error_code}): {e.message}."
|
|
||||||
suggestions = getattr(e, 'context', {}).get('suggestions', [])
|
|
||||||
if suggestions: error_msg += "\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions)
|
|
||||||
elif error_code == "NODEJS_SCRIPT_ERROR":
|
|
||||||
error_msg = f"Node.js script error ({error_code}): {e.message}."
|
|
||||||
elif error_code == "NODEJS_TIMEOUT":
|
|
||||||
error_msg = f"Node.js timeout ({error_code}): {e.message}."
|
|
||||||
# Add more specific error handling as needed
|
|
||||||
raise AirflowException(error_msg)
|
|
||||||
except TTransportException as e:
|
|
||||||
logger.error(f"Thrift transport error during getOrRefreshToken: {e}")
|
|
||||||
raise AirflowException(f"Transport error during API call: {e}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Unexpected error during getOrRefreshToken: {e}")
|
|
||||||
raise AirflowException(f"Unexpected error during API call: {e}")
|
|
||||||
|
|
||||||
except TTransportException as e:
|
|
||||||
# Handle connection-specific transport errors
|
|
||||||
if "read 0 bytes" in str(e) or "Could not connect to" in str(e) or "Connection refused" in str(e):
|
|
||||||
logger.error(f"Connection failed to {host}:{port}. Details: {e}")
|
|
||||||
logger.error("Possible causes: Server down, firewall block, incorrect IP/port.")
|
|
||||||
raise AirflowException(f"Failed to connect to YTDLP service at {host}:{port}: {e}")
|
|
||||||
else:
|
|
||||||
logger.error(f"Thrift transport error during connection: {str(e)}")
|
|
||||||
raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Unexpected error during connection or ping: {str(e)}")
|
|
||||||
raise # Re-raise other unexpected errors
|
|
||||||
|
|
||||||
# Log received token data attributes for debugging
|
|
||||||
logger.debug(f"Token data received. Attributes: {dir(token_data)}")
|
|
||||||
for attr in dir(token_data):
|
|
||||||
if not attr.startswith('__') and not callable(getattr(token_data, attr)): # Log non-callable attributes
|
|
||||||
value = getattr(token_data, attr)
|
|
||||||
if attr == 'infoJson' and value:
|
|
||||||
logger.debug(f"infoJson: {value[:50]}...")
|
|
||||||
else:
|
|
||||||
logger.debug(f"{attr}: {value}")
|
|
||||||
|
|
||||||
info_json_path = None # Initialize info_json_path
|
|
||||||
|
|
||||||
save_info_json_param = context['params'].get('save_info_json', self.save_info_json)
|
|
||||||
# Render if it's a string template
|
|
||||||
if isinstance(save_info_json_param, str):
|
|
||||||
save_info_json_rendered = self.render_template(save_info_json_param, context)
|
|
||||||
# Convert common string representations to boolean
|
|
||||||
save_info_json = str(save_info_json_rendered).lower() in ['true', '1', 't', 'y', 'yes']
|
|
||||||
else:
|
|
||||||
save_info_json = bool(save_info_json_param)
|
|
||||||
|
|
||||||
|
|
||||||
# Save info.json if requested and valid
|
|
||||||
if self.save_info_json:
|
|
||||||
info_json = self._get_info_json(token_data)
|
|
||||||
if info_json and self._is_valid_json(info_json):
|
|
||||||
try:
|
|
||||||
# Use internal _save_info_json method which handles rendering, dir creation, logging
|
|
||||||
info_json_path = self._save_info_json(context, info_json)
|
|
||||||
if info_json_path: # Check if saving was successful
|
|
||||||
context['task_instance'].xcom_push(key='info_json_path', value=info_json_path)
|
|
||||||
logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}")
|
|
||||||
else:
|
|
||||||
# _save_info_json should log errors, push None to indicate failure
|
|
||||||
context['task_instance'].xcom_push(key='info_json_path', value=None)
|
|
||||||
logger.warning("info.json saving failed (check logs from _save_info_json), pushing None to XCom for info_json_path.")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True)
|
|
||||||
context['task_instance'].xcom_push(key='info_json_path', value=None) # Push None on error
|
|
||||||
elif info_json:
|
|
||||||
logger.warning("Retrieved infoJson is not valid JSON. Skipping save.")
|
|
||||||
context['task_instance'].xcom_push(key='info_json_path', value=None)
|
|
||||||
else:
|
|
||||||
logger.info("No infoJson found in token data. Skipping save.")
|
|
||||||
context['task_instance'].xcom_push(key='info_json_path', value=None)
|
|
||||||
else:
|
|
||||||
logger.info("save_info_json is False. Skipping info.json save.")
|
|
||||||
context['task_instance'].xcom_push(key='info_json_path', value=None)
|
|
||||||
|
|
||||||
|
|
||||||
# Extract and potentially store SOCKS proxy
|
|
||||||
socks_proxy = None
|
|
||||||
if self.get_socks_proxy: # Use instance attribute
|
|
||||||
# Check for common attribute names for proxy
|
|
||||||
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
|
|
||||||
if proxy_attr:
|
|
||||||
socks_proxy = getattr(token_data, proxy_attr)
|
|
||||||
if socks_proxy: # Ensure proxy value is not empty
|
|
||||||
logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
|
|
||||||
if self.store_socks_proxy: # Use instance attribute
|
|
||||||
context['task_instance'].xcom_push(key='socks_proxy', value=socks_proxy)
|
|
||||||
logger.info(f"Pushed key 'socks_proxy' to XCom with value: {socks_proxy}")
|
|
||||||
else:
|
|
||||||
logger.info("SOCKS proxy extracted but not pushed to XCom (store_socks_proxy=False).")
|
|
||||||
else:
|
|
||||||
logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty. No proxy extracted.")
|
|
||||||
# Push None even if found but empty, if storing is enabled
|
|
||||||
if self.store_socks_proxy: # Use instance attribute
|
|
||||||
context['task_instance'].xcom_push(key='socks_proxy', value=None)
|
|
||||||
logger.info("Pushed None to XCom for 'socks_proxy' as extracted value was empty.")
|
|
||||||
else:
|
|
||||||
logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found in token data.")
|
|
||||||
# Push None if storing is enabled but attribute not found
|
|
||||||
if self.store_socks_proxy: # Use instance attribute
|
|
||||||
context['task_instance'].xcom_push(key='socks_proxy', value=None)
|
|
||||||
logger.info("Pushed None to XCom for 'socks_proxy' as attribute was not found.")
|
|
||||||
else:
|
|
||||||
logger.info("get_socks_proxy is False. Skipping proxy extraction.")
|
|
||||||
# Push None if storing is enabled but extraction was skipped
|
|
||||||
if self.store_socks_proxy: # Use instance attribute
|
|
||||||
context['task_instance'].xcom_push(key='socks_proxy', value=None)
|
|
||||||
logger.info("Pushed None to XCom for 'socks_proxy' as get_socks_proxy=False.")
|
|
||||||
|
|
||||||
|
|
||||||
# Get the original command from the server
|
|
||||||
ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
|
|
||||||
if not ytdlp_cmd:
|
|
||||||
logger.error("No 'ytdlpCommand' attribute found in token data.")
|
|
||||||
raise AirflowException("Required 'ytdlpCommand' not received from service.")
|
|
||||||
|
|
||||||
logger.info(f"Original command received from server: {ytdlp_cmd}")
|
|
||||||
|
|
||||||
# Log example usage command (DO NOT MODIFY the original command here)
|
|
||||||
if info_json_path:
|
|
||||||
# Use double quotes for paths/proxy in example for robustness
|
|
||||||
example_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
|
|
||||||
if socks_proxy:
|
|
||||||
example_cmd += f" --proxy \"{socks_proxy}\""
|
|
||||||
example_cmd += " --verbose --simulate" # Add useful flags for testing
|
|
||||||
logger.info(f"\n--- Example usage with saved info.json ---")
|
|
||||||
logger.info(example_cmd)
|
|
||||||
logger.info(f"(Note: The actual command with tokens/cookies is pushed to XCom as 'ytdlp_command')")
|
|
||||||
latest_json_path = os.path.join(os.path.dirname(info_json_path), 'latest.json')
|
|
||||||
logger.info(f"(You can also use 'latest.json': {latest_json_path})")
|
|
||||||
logger.info(f"-------------------------------------------\n")
|
|
||||||
|
|
||||||
else:
|
|
||||||
logger.info("\n--- Original command pushed to XCom ('ytdlp_command') ---")
|
|
||||||
if socks_proxy:
|
|
||||||
logger.info(f"Use the extracted proxy '{socks_proxy}' (pushed to XCom if store_socks_proxy=True) with the --proxy flag.")
|
|
||||||
logger.info("Add --verbose and --simulate flags for testing the command.")
|
|
||||||
logger.info(f"-------------------------------------------------------\n")
|
|
||||||
|
|
||||||
|
|
||||||
            # Push the *original* command to XCom
            context['task_instance'].xcom_push(key='ytdlp_command', value=ytdlp_cmd)
            logger.info("Pushed original command to XCom key 'ytdlp_command'.")

            # Note: Returning ytdlp_cmd below implicitly pushes the same value
            # to XCom under the key 'return_value'. Downstream tasks should
            # preferably use the explicitly pushed 'ytdlp_command' key for clarity.
            return ytdlp_cmd  # Return the original command

        except AirflowException as e:  # Catch AirflowExceptions raised explicitly in the code above
            logger.error(f"Operation failed due to AirflowException: {e}")
            raise  # Re-raise AirflowExceptions to ensure task failure
        except (TTransportException, PBServiceException) as e:  # Catch specific Thrift/Service errors not already wrapped
            logger.error(f"Unhandled Thrift/Service error: {e}", exc_info=True)  # Add traceback for context
            raise AirflowException(f"Unhandled YTDLP service error: {e}")  # Wrap in AirflowException
        except Exception as e:  # General catch-all for truly unexpected errors
            # Log with traceback for unexpected errors
            logger.error(f"Caught unexpected error in YtdlpOpsOperator: {e}", exc_info=True)
            # Ensure any unexpected error explicitly fails the task with AirflowException
            raise AirflowException(f"Unexpected error caused task failure: {e}")
        finally:
            if transport and transport.isOpen():  # Check if transport exists and is open before closing
                logger.info("Closing Thrift transport.")
                transport.close()

    # --- Helper Methods ---

    def _get_info_json(self, token_data):
        """Safely extracts infoJson from token data."""
        info_json = getattr(token_data, 'infoJson', None)
        if info_json:
            logger.debug("Extracted infoJson from token data.")
        else:
            logger.debug("No infoJson attribute found in token data.")
        return info_json

    def _is_valid_json(self, json_str):
        """Checks if a string is valid JSON."""
        if not json_str or not isinstance(json_str, str):
            logger.debug("Input is not a non-empty string, considered invalid JSON.")
            return False
        try:
            json.loads(json_str)
            logger.debug("JSON string validation successful.")
            return True
        except json.JSONDecodeError as e:
            logger.warning(f"JSON validation failed: {e}")
            return False

    def _save_info_json(self, context, info_json):
        """Saves info_json to a file, handling directory creation and logging. Returns the path on success, None on failure."""
        try:
            # Get URL from params/context for video ID extraction
            url_param = context.get('params', {}).get('url', self.url)
            video_id = self._extract_video_id(url_param)  # Use internal helper

            # Render the info_json_dir template
            save_dir_template = self.info_json_dir or "."  # Default to current dir if template is None or empty
            save_dir = self.render_template(save_dir_template, context)
            if not save_dir:  # Handle case where template renders to empty string
                logger.warning(f"Rendered info_json_dir template '{save_dir_template}' resulted in an empty path. Defaulting to '.'")
                save_dir = "."
            logger.info(f"Target directory for info.json (rendered): {save_dir}")

            # Ensure directory exists
            try:
                os.makedirs(save_dir, exist_ok=True)
                logger.info(f"Ensured directory exists: {save_dir}")
            except OSError as e:
                logger.error(f"Could not create directory {save_dir}: {e}. Cannot save info.json.")
                return None  # Indicate failure

            # Construct filename (using potentially overridden account_id)
            account_id_param = context.get('params', {}).get('account_id', self.account_id)
            timestamp = int(time.time())
            base_filename = f"info_{video_id}_{account_id_param}_{timestamp}.json" if video_id else f"info_{account_id_param}_{timestamp}.json"
            info_json_path = os.path.join(save_dir, base_filename)
            latest_json_path = os.path.join(save_dir, "latest.json")  # Path for the latest copy

            # Write to timestamped file
            try:
                logger.info(f"Writing info.json content (received from service) to {info_json_path}...")
                with open(info_json_path, 'w', encoding='utf-8') as f:
                    f.write(info_json)
                logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}")
            except IOError as e:
                logger.error(f"Failed to write info.json to {info_json_path}: {e}")
                return None  # Indicate failure

            # Write to latest.json (overwrite) - best effort
            try:
                with open(latest_json_path, 'w', encoding='utf-8') as f:
                    f.write(info_json)
                logger.info(f"Updated latest.json file: {latest_json_path}")
            except IOError as e:
                # Log warning but don't fail the whole save if only latest.json fails
                logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}")

            return info_json_path  # Return path on success (even if latest.json failed)

        except Exception as e:
            logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True)
            return None  # Indicate failure

    def _extract_video_id(self, url):
        """Extracts YouTube video ID from URL (internal helper)."""
        if not url or not isinstance(url, str):
            logger.debug("URL is empty or not a string, cannot extract video ID.")
            return None
        try:
            # Basic extraction logic (can be enhanced for more URL types)
            video_id = None
            if 'youtube.com/watch?v=' in url:
                video_id = url.split('v=')[1].split('&')[0]
            elif 'youtu.be/' in url:
                video_id = url.split('youtu.be/')[1].split('?')[0]

            # Ensure it looks like a video ID (typically 11 chars, but can vary)
            if video_id and len(video_id) >= 11:
                video_id = video_id[:11]  # Take first 11 chars as standard ID length
                logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
                return video_id
            else:
                logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
                return None
        except Exception as e:
            logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
            return None


# =============================================================================
|
|
||||||
# Python Callables for Tasks
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
def display_token_info(**context):
|
|
||||||
"""Displays token info from XCom, parses info.json, and logs example commands."""
|
|
||||||
ti = context['task_instance']
|
|
||||||
logger.info("Starting display_token_info task.")
|
|
||||||
|
|
||||||
# Pull data from XCom (provide default values)
|
|
||||||
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
|
|
||||||
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
|
|
||||||
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
|
|
||||||
|
|
||||||
logger.info("\n=== Pulled Token Information from XCom ===")
|
|
||||||
logger.info(f"Info.json path: {info_json_path or 'Not found/Not saved'}")
|
|
||||||
logger.info(f"SOCKS Proxy: {socks_proxy or 'Not found/Not extracted'}")
|
|
||||||
logger.info(f"Original yt-dlp command (with tokens): {ytdlp_command or 'Not found'}")
|
|
||||||
|
|
||||||
result = {
|
|
||||||
'info_path': info_json_path,
|
|
||||||
'proxy': socks_proxy,
|
|
||||||
'ytdlp_command': ytdlp_command,
|
|
||||||
'video_info': None,
|
|
||||||
'commands': {},
|
|
||||||
'error': None
|
|
||||||
}
|
|
||||||
|
|
||||||
if info_json_path and os.path.exists(info_json_path):
|
|
||||||
logger.info(f"\n=== Processing Video Information from: {info_json_path} ===")
|
|
||||||
try:
|
|
||||||
with open(info_json_path, 'r', encoding='utf-8') as f:
|
|
||||||
info = json.load(f)
|
|
||||||
|
|
||||||
# Extract and log basic video info safely
|
|
||||||
title = info.get('title', 'Unknown Title')
|
|
||||||
uploader = info.get('uploader', 'Unknown Author')
|
|
||||||
duration = info.get('duration_string', 'Unknown Length')
|
|
||||||
upload_date_str = info.get('upload_date') # Format: YYYYMMDD
|
|
||||||
upload_date_formatted = 'Unknown Date'
|
|
||||||
if upload_date_str:
|
|
||||||
try:
|
|
||||||
# Validate format before parsing
|
|
||||||
if len(upload_date_str) == 8 and upload_date_str.isdigit():
|
|
||||||
upload_date_formatted = datetime.strptime(upload_date_str, '%Y%m%d').strftime('%Y-%m-%d')
|
|
||||||
else:
|
|
||||||
logger.warning(f"Upload date '{upload_date_str}' is not in YYYYMMDD format.")
|
|
||||||
except ValueError:
|
|
||||||
logger.warning(f"Could not parse upload_date '{upload_date_str}'")
|
|
||||||
|
|
||||||
result['video_info'] = {
|
|
||||||
'title': title,
|
|
||||||
'uploader': uploader,
|
|
||||||
'upload_date': upload_date_formatted, # Store formatted date
|
|
||||||
'duration': duration
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info(f"Title: {title}")
|
|
||||||
logger.info(f"Author: {uploader}")
|
|
||||||
logger.info(f"Date: {upload_date_formatted}")
|
|
||||||
logger.info(f"Length: {duration}")
|
|
||||||
|
|
||||||
logger.info("\n=== Example yt-dlp Commands (using saved info.json) ===")
|
|
||||||
base_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
|
|
||||||
if socks_proxy:
|
|
||||||
base_cmd += f" --proxy \"{socks_proxy}\""
|
|
||||||
|
|
||||||
# Command to list formats
|
|
||||||
format_cmd = f"{base_cmd} -F"
|
|
||||||
result['commands']['format'] = format_cmd
|
|
||||||
logger.info(f"List formats command: {format_cmd}")
|
|
||||||
|
|
||||||
# Execute and log the format listing command
|
|
||||||
logger.info("\n--- Executing Format List Command ---")
|
|
||||||
try:
|
|
||||||
# Use os.popen for simplicity, capture output
|
|
||||||
logger.info(f"Running: {format_cmd}")
|
|
||||||
format_output = os.popen(format_cmd).read()
|
|
||||||
logger.info("--- Format List Output ---")
|
|
||||||
logger.info(format_output)
|
|
||||||
logger.info("--------------------------")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error executing format command: {e}")
|
|
||||||
|
|
||||||
# Command to simulate download
|
|
||||||
simulate_cmd = f"{base_cmd} --simulate --verbose" # Add verbose for more info
|
|
||||||
result['commands']['simulate'] = simulate_cmd
|
|
||||||
logger.info(f"Simulate download command: {simulate_cmd}")
|
|
||||||
|
|
||||||
# Execute and log the simulation command
|
|
||||||
logger.info("\n--- Executing Simulation Command ---")
|
|
||||||
try:
|
|
||||||
logger.info(f"Running: {simulate_cmd}")
|
|
||||||
simulate_output = os.popen(simulate_cmd).read()
|
|
||||||
logger.info("--- Simulation Output ---")
|
|
||||||
logger.info(simulate_output)
|
|
||||||
logger.info("-------------------------")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error executing simulation command: {e}")
|
|
||||||
|
|
||||||
# Basic download command
|
|
||||||
download_cmd = base_cmd
|
|
||||||
result['commands']['download_base'] = download_cmd
|
|
||||||
logger.info(f"Base download command (add format selection, output path): {download_cmd}")
|
|
||||||
|
|
||||||
# Push generated example commands to XCom for potential downstream use
|
|
||||||
# ti.xcom_push(key='format_cmd', value=format_cmd) # Removed as requested
|
|
||||||
# ti.xcom_push(key='simulate_cmd', value=simulate_cmd) # Removed as requested
|
|
||||||
ti.xcom_push(key='download_cmd', value=download_cmd)
|
|
||||||
logger.info(f"Pushed key 'download_cmd' to XCom with value: {download_cmd}")
|
|
||||||
|
|
||||||
except json.JSONDecodeError as e:
|
|
||||||
error_msg = f"Failed to parse info.json file '{info_json_path}': {e}"
|
|
||||||
logger.error(error_msg)
|
|
||||||
result['error'] = error_msg
|
|
||||||
except FileNotFoundError:
|
|
||||||
error_msg = f"Info.json file not found at path: {info_json_path}"
|
|
||||||
logger.error(error_msg)
|
|
||||||
result['error'] = error_msg
|
|
||||||
except Exception as e:
|
|
||||||
error_msg = f"Error processing info.json file '{info_json_path}': {str(e)}"
|
|
||||||
logger.error(error_msg, exc_info=True)
|
|
||||||
result['error'] = error_msg
|
|
||||||
elif info_json_path:
|
|
||||||
error_msg = f"Info.json path provided ('{info_json_path}') but file does not exist."
|
|
||||||
logger.warning(error_msg)
|
|
||||||
result['error'] = error_msg
|
|
||||||
else:
|
|
||||||
logger.warning("No info.json path found in XCom. Cannot display video details or generate example commands.")
|
|
||||||
result['error'] = "Info.json path not available."
|
|
||||||
|
|
||||||
logger.info("Finished display_token_info task.")
|
|
||||||
# Return the collected information (useful if used as a PythonOperator return value)
|
|
||||||
return json.dumps(result) # Return as JSON string for XCom compatibility if needed
|
|
||||||
|
|
||||||
|
|
||||||
def store_token_info(**context):
|
|
||||||
"""Stores retrieved token information (command, proxy, info.json) in Redis."""
|
|
||||||
ti = context['task_instance']
|
|
||||||
# Use the redis_conn_id defined in the operator/DAG params if possible, else default
|
|
||||||
redis_conn_id = context['params'].get('redis_conn_id', 'redis_default')
|
|
||||||
redis_hook = RedisHook(redis_conn_id=redis_conn_id)
|
|
||||||
logger.info(f"Starting store_token_info task using Redis connection '{redis_conn_id}'.")
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Pull necessary data from XCom and context
|
|
||||||
url = context['params'].get('url')
|
|
||||||
if not url:
|
|
||||||
# Attempt to get URL from DAG run conf as fallback
|
|
||||||
url = context.get('dag_run', {}).conf.get('url')
|
|
||||||
if not url:
|
|
||||||
raise ValueError("URL parameter is missing in context['params'] and dag_run.conf")
|
|
||||||
logger.warning("URL parameter missing in context['params'], using URL from dag_run.conf.")
|
|
||||||
|
|
||||||
|
|
||||||
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
|
|
||||||
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy') or '' # Default to empty string if None
|
|
||||||
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
|
|
||||||
|
|
||||||
if not ytdlp_command:
|
|
||||||
logger.warning("ytdlp_command not found in XCom. Storing empty value.")
|
|
||||||
ytdlp_command = '' # Store empty if not found
|
|
||||||
|
|
||||||
# Construct the base command using info.json
|
|
||||||
ytdlp_command_base = ''
|
|
||||||
if info_json_path and os.path.exists(info_json_path):
|
|
||||||
ytdlp_command_base = f"yt-dlp --load-info-json \"{info_json_path}\""
|
|
||||||
logger.info(f"Constructed base command: {ytdlp_command_base}")
|
|
||||||
else:
|
|
||||||
logger.warning("Cannot construct base command: info_json_path not valid.")
|
|
||||||
|
|
||||||
# Construct the command with tokens and proxy
|
|
||||||
ytdlp_command_tokens = ytdlp_command # Start with original command from server
|
|
||||||
if socks_proxy:
|
|
||||||
ytdlp_command_tokens += f" --proxy \"{socks_proxy}\""
|
|
||||||
logger.info("Appended proxy to token command.")
|
|
||||||
|
|
||||||
data_to_store = {
|
|
||||||
'url': url,
|
|
||||||
'ytdlp_command': ytdlp_command_base, # Store the base command
|
|
||||||
'proxy': socks_proxy,
|
|
||||||
'info_json_path': info_json_path or '' # Store path even if None/empty
|
|
||||||
# 'info_json' will be added below
|
|
||||||
}
|
|
||||||
|
|
||||||
# Read info.json content if path exists
|
|
||||||
info_json_content = None
|
|
||||||
if info_json_path and os.path.exists(info_json_path):
|
|
||||||
try:
|
|
||||||
with open(info_json_path, 'r', encoding='utf-8') as f:
|
|
||||||
# Read and immediately validate JSON structure before storing
|
|
||||||
info_json_content = json.load(f)
|
|
||||||
# Store the validated JSON as a string
|
|
||||||
data_to_store['info_json'] = json.dumps(info_json_content)
|
|
||||||
logger.info(f"Read and validated info.json content from: {info_json_path}")
|
|
||||||
except json.JSONDecodeError as e:
|
|
||||||
logger.error(f"Failed to parse info.json file '{info_json_path}' as JSON: {e}. Storing empty content.")
|
|
||||||
data_to_store['info_json'] = '' # Store empty string on parse error
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to read info.json file '{info_json_path}': {e}. Storing empty content.")
|
|
||||||
data_to_store['info_json'] = '' # Store empty string on other read errors
|
|
||||||
else:
|
|
||||||
logger.warning(f"info_json_path ('{info_json_path}') not found or invalid. Storing without info_json content.")
|
|
||||||
data_to_store['info_json'] = '' # Store empty string if no path

        # Determine the Redis key from the video ID. The operator's extraction helper is an
        # instance method, so the basic logic is re-implemented here for simplicity.
        video_id = None
        try:
            if 'youtube.com/watch?v=' in url:
                video_id = url.split('v=')[1].split('&')[0][:11]
            elif 'youtu.be/' in url:
                video_id = url.split('youtu.be/')[1].split('?')[0][:11]
        except Exception:
            pass  # Ignore errors in ID extraction for key generation
        redis_key = f"token_info:{video_id or 'unknown'}"
        logger.info(f"Determined Redis key: {redis_key}")

        # Log presence/absence rather than full content for potentially large fields
        logger.info(f"Data to store in Redis key '{redis_key}': "
                    f"URL='{data_to_store['url']}', "
                    f"Command={'<present>' if data_to_store['ytdlp_command'] else '<empty>'}, "
                    f"Proxy='{data_to_store['proxy'] or '<empty>'}', "
                    f"Path='{data_to_store['info_json_path'] or '<empty>'}', "
                    f"JSON Content={'<present>' if data_to_store.get('info_json') else '<empty>'}")

        with redis_hook.get_conn() as redis_client:
            # Add video_id, timestamp, and the constructed ytdlp_command_tokens
            data_to_store['video_id'] = video_id or 'unknown'
            data_to_store['timestamp'] = int(time.time())
            data_to_store['ytdlp_command_tokens'] = ytdlp_command_tokens  # Original token command from the server

            # Log fields being stored (truncate long string values)
            log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()}
            logger.info(f"Storing in Redis key '{redis_key}': {log_data}")

            # Store data in a Redis hash and set a 24-hour expiration (86400 seconds)
            redis_client.hset(redis_key, mapping=data_to_store)
            redis_client.expire(redis_key, 86400)
            logger.info(f"Successfully stored token info in Redis key '{redis_key}' with 24h expiration.")

    except Exception as e:
        logger.error(f"Failed to store token info in Redis: {e}", exc_info=True)
        # Re-raise as AirflowException to fail the task
        raise AirflowException(f"Failed to store token info in Redis: {e}")

    logger.info("Finished store_token_info task.")

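# Illustrative only (not part of the original DAG): a sketch of how the hash written above
# could be read back from an ad-hoc script. The connection id mirrors the task's default;
# the video id passed in would be a placeholder chosen by the caller.
def _example_read_token_info(video_id, redis_conn_id='redis_default'):
    """Return the decoded token_info hash for a video id (sketch, assumes the same Redis)."""
    hook = RedisHook(redis_conn_id=redis_conn_id)
    with hook.get_conn() as redis_client:
        raw = redis_client.hgetall(f"token_info:{video_id}")
    # Redis returns bytes; decode keys and values for convenience
    return {k.decode('utf-8'): v.decode('utf-8') for k, v in raw.items()}
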
# =============================================================================
|
|
||||||
# DAG Definition
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
# Update default_args to match ytdlp_client_dag.py structure
|
|
||||||
default_args = {
|
|
||||||
'owner': 'airflow',
|
|
||||||
'depends_on_past': False,
|
|
||||||
'email_on_failure': False, # Match reference DAG
|
|
||||||
'email_on_retry': False, # Match reference DAG
|
|
||||||
'retries': 1, # Default task retries
|
|
||||||
'retry_delay': timedelta(minutes=5), # Standard task retry delay
|
|
||||||
'start_date': days_ago(1) # Best practice start date
|
|
||||||
}
|
|
||||||
|
|
||||||
# Update DAG definition
|
|
||||||
with DAG(
|
|
||||||
dag_id='ytdlp_client_dag_v2.1',
|
|
||||||
default_args=default_args,
|
|
||||||
schedule_interval=None, # Manually triggered DAG
|
|
||||||
catchup=False, # Don't run for past missed schedules
|
|
||||||
description='DAG for YTDLP operations using Thrift client (V2 - Refactored)', # Updated description
|
|
||||||
tags=['ytdlp', 'thrift', 'client', 'v2'], # Updated tags for better filtering
|
|
||||||
params={
|
|
||||||
# Define DAG parameters with defaults and types for UI clarity
|
|
||||||
'url': Param('https://www.youtube.com/watch?v=sOlTX9uxUtM', type=["null", "string"], description="Required: The video URL to process."), # Default URL
|
|
||||||
'redis_enabled': Param(False, type="boolean", description="Use Redis for service discovery? If False, uses service_ip/port."), # Default to direct connection
|
|
||||||
'service_ip': Param('85.192.30.55', type="string", description="Service IP if redis_enabled=False."), # Default service IP
|
|
||||||
'service_port': Param(9090, type="integer", description="Service port if redis_enabled=False."), # Default service port
|
|
||||||
'account_id': Param('account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09', type="string", description="Account ID for Redis lookup or direct call."), # Updated default account_id
|
|
||||||
'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
|
|
||||||
# Use Airflow Variable for downloads directory, matching reference DAG structure
|
|
||||||
'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json. Uses Airflow Variable 'DOWNLOADS_TEMP' or default.")
|
|
||||||
}
|
|
||||||
) as dag:
|
|
||||||
|
|
||||||
# Define Tasks
|
|
||||||
|
|
||||||
get_token = YtdlpOpsOperator(
|
|
||||||
task_id='get_token',
|
|
||||||
# Pass templated parameters from DAG run config
|
|
||||||
url="{{ params.url }}",
|
|
||||||
redis_enabled="{{ params.redis_enabled }}",
|
|
||||||
service_ip="{{ params.service_ip }}",
|
|
||||||
service_port="{{ params.service_port }}",
|
|
||||||
account_id="{{ params.account_id }}",
|
|
||||||
save_info_json=True,
|
|
||||||
info_json_dir="{{ params.info_json_dir }}",
|
|
||||||
get_socks_proxy=True,
|
|
||||||
store_socks_proxy=True,
|
|
||||||
timeout="{{ params.timeout }}",
|
|
||||||
retries=MAX_RETRIES, # Operator-specific retries if needed, else use DAG default
|
|
||||||
retry_delay=RETRY_DELAY, # Operator-specific delay if needed
|
|
||||||
# Add callbacks for logging success/failure, similar to reference DAG
|
|
||||||
on_failure_callback=lambda context: logger.error(f"Task {context['task_instance_key_str']} failed."),
|
|
||||||
on_success_callback=lambda context: logger.info(f"Task {context['task_instance_key_str']} succeeded.")
|
|
||||||
)
|
|
||||||
# Add task documentation (visible in Airflow UI)
|
|
||||||
get_token.doc_md = """
|
|
||||||
### Get Token Task
|
|
||||||
Connects to the YTDLP Thrift service (either directly or via Redis discovery)
|
|
||||||
to retrieve an authentication token and video metadata (info.json).
|
|
||||||
|
|
||||||
**Pushes to XCom:**
|
|
||||||
- `info_json_path`: Path to the saved info.json file (or None if not saved/failed).
|
|
||||||
- `socks_proxy`: The extracted SOCKS proxy string (or None if not requested/found).
|
|
||||||
- `ytdlp_command`: The original command string received from the server (contains tokens/cookies).
|
|
||||||
|
|
||||||
- Uses parameters defined in the DAG run configuration.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Optional: Add a task to explicitly check XComs for debugging (like in reference DAG)
|
|
||||||
def _check_xcom_callable(**context):
|
|
||||||
"""Logs XCom values pushed by the get_token task."""
|
|
||||||
ti = context['task_instance']
|
|
||||||
logger.info("--- Checking XCom values pushed by get_token ---")
|
|
||||||
keys_to_check = ['info_json_path', 'socks_proxy', 'ytdlp_command']
|
|
||||||
xcom_values = {}
|
|
||||||
for key in keys_to_check:
|
|
||||||
value = ti.xcom_pull(task_ids='get_token', key=key)
|
|
||||||
xcom_values[key] = value
|
|
||||||
# Avoid logging potentially sensitive command details fully in production
|
|
||||||
if key == 'ytdlp_command' and value:
|
|
||||||
log_value = f"{value[:50]}..." # Log truncated command
|
|
||||||
else:
|
|
||||||
log_value = value
|
|
||||||
logger.info(f"XCom key='{key}': {log_value}")
|
|
||||||
logger.info("----------------------------------------------")
|
|
||||||
return xcom_values # Return values for potential future use
|
|
||||||
|
|
||||||
check_xcom_task = PythonOperator(
|
|
||||||
task_id='check_xcom_after_get_token',
|
|
||||||
python_callable=_check_xcom_callable,
|
|
||||||
)
|
|
||||||
check_xcom_task.doc_md = "Logs the values pushed to XCom by the 'get_token' task for debugging purposes."
|
|
||||||
|
|
||||||
display_info = PythonOperator(
|
|
||||||
task_id='display_token_info',
|
|
||||||
python_callable=display_token_info,
|
|
||||||
trigger_rule='all_success'
|
|
||||||
)
|
|
||||||
display_info.doc_md = """
|
|
||||||
### Display Token Info Task
|
|
||||||
Pulls information from XCom, parses the `info.json` file (if available),
|
|
||||||
logs video details, and generates example `yt-dlp` commands.
|
|
||||||
|
|
||||||
**Pulls from XCom (task_id='get_token'):**
|
|
||||||
- `info_json_path`
|
|
||||||
- `socks_proxy`
|
|
||||||
- `ytdlp_command`
|
|
||||||
|
|
||||||
**Pushes to XCom:**
|
|
||||||
- `download_cmd`: Base command using `--load-info-json` (user needs to add format/output).
|
|
||||||
"""
|
|
||||||
|
|
||||||
store_info = PythonOperator(
|
|
||||||
task_id='store_token_info', # Use consistent task ID naming
|
|
||||||
python_callable=store_token_info,
|
|
||||||
)
|
|
||||||
store_info.doc_md = """
|
|
||||||
### Store Token Info Task
|
|
||||||
Pulls information from XCom and DAG parameters, reads the `info.json` content,
|
|
||||||
and stores relevant data in a Redis hash.
|
|
||||||
|
|
||||||
**Pulls from XCom (task_id='get_token'):**
|
|
||||||
- `ytdlp_command`
|
|
||||||
- `socks_proxy`
|
|
||||||
- `info_json_path`
|
|
||||||
|
|
||||||
**Pulls from DAG context:**
|
|
||||||
- `params['url']` (or `dag_run.conf['url']`)
|
|
||||||
|
|
||||||
**Stores in Redis Hash (key: `token_info:<video_id>`):**
|
|
||||||
- `url`: The video URL.
|
|
||||||
- `ytdlp_command`: Base command using `--load-info-json`.
|
|
||||||
- `proxy`: The SOCKS proxy string.
|
|
||||||
- `info_json_path`: Path to the saved info.json file.
|
|
||||||
- `info_json`: The full content of the info.json file (as a JSON string).
|
|
||||||
- `video_id`: Extracted video ID.
|
|
||||||
- `timestamp`: Unix timestamp of storage.
|
|
||||||
- `ytdlp_command_tokens`: The original command string from the server (contains tokens/cookies).
|
|
||||||
|
|
||||||
Sets a 24-hour expiration on the Redis key.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Define task dependencies matching the reference DAG structure
|
|
||||||
get_token >> check_xcom_task >> display_info >> store_info
|
|
||||||
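# Illustrative usage (sketch, not part of the original file): one way a script could trigger
# this DAG with a run config, and how a downstream task could template the XCom keys pushed
# by 'get_token'. The task id 'run_download' is an assumption, not taken from this repository.
def _example_trigger_and_consume():
    from airflow.api.common.trigger_dag import trigger_dag
    from airflow.operators.bash import BashOperator

    # Trigger a run, overriding the DAG-level params defined above.
    trigger_dag(
        dag_id='ytdlp_client_dag_v2.1',
        conf={'url': 'https://www.youtube.com/watch?v=sOlTX9uxUtM', 'redis_enabled': False},
    )

    # A downstream task could template the pushed command and proxy like this.
    return BashOperator(
        task_id='run_download',
        bash_command=(
            "{{ ti.xcom_pull(task_ids='get_token', key='ytdlp_command') }}"
            " --proxy '{{ ti.xcom_pull(task_ids='get_token', key='socks_proxy') }}'"
        ),
    )
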
@@ -1,179 +0,0 @@
# -*- coding: utf-8 -*-
|
|
||||||
# vim:fenc=utf-8
|
|
||||||
#
|
|
||||||
# Copyright © 2024 rl <rl@rlmbp>
|
|
||||||
#
|
|
||||||
# Distributed under terms of the MIT license.
|
|
||||||
|
|
||||||
"""
|
|
||||||
Airflow DAG for manually checking the status (type and size) of a specific Redis key used by YTDLP queues.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from airflow import DAG
|
|
||||||
from airflow.exceptions import AirflowException
|
|
||||||
from airflow.models.param import Param
|
|
||||||
from airflow.operators.python import PythonOperator
|
|
||||||
from airflow.providers.redis.hooks.redis import RedisHook
|
|
||||||
from airflow.utils.dates import days_ago
|
|
||||||
from datetime import datetime, timedelta, timezone
|
|
||||||
import logging
|
|
||||||
import json
|
|
||||||
import redis # Import redis exceptions if needed
|
|
||||||
|
|
||||||
# Configure logging
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Default settings
|
|
||||||
DEFAULT_REDIS_CONN_ID = 'redis_default'
|
|
||||||
DEFAULT_QUEUE_BASE_NAME = 'video_queue'
|
|
||||||
DEFAULT_MAX_ITEMS_TO_LIST = 25
|
|
||||||
|
|
||||||
# Import utility functions
|
|
||||||
from utils.redis_utils import _get_redis_client
|
|
||||||
|
|
||||||
# --- Python Callable for Check and List Task ---
|
|
||||||
|
|
||||||
def check_and_list_queue_callable(**context):
|
|
||||||
"""Checks the type and size of a Redis key and lists its recent contents."""
|
|
||||||
params = context['params']
|
|
||||||
redis_conn_id = params['redis_conn_id']
|
|
||||||
# queue_suffix is passed from the PythonOperator's op_kwargs, which are available in the context
|
|
||||||
queue_suffix = context['queue_suffix']
|
|
||||||
queue_name = params.get('queue_name', DEFAULT_QUEUE_BASE_NAME)
|
|
||||||
queue_to_check = f"{queue_name}{queue_suffix}"
|
|
||||||
max_items = int(params.get('max_items_to_list', DEFAULT_MAX_ITEMS_TO_LIST))
|
|
||||||
|
|
||||||
logger.info(f"--- Checking Status and Contents of Redis Key: '{queue_to_check}' ---")
|
|
||||||
logger.info(f"Using connection '{redis_conn_id}', listing up to {max_items} items.")
|
|
||||||
|
|
||||||
try:
|
|
||||||
redis_client = _get_redis_client(redis_conn_id)
|
|
||||||
key_type_bytes = redis_client.type(queue_to_check)
|
|
||||||
key_type = key_type_bytes.decode('utf-8')
|
|
||||||
|
|
||||||
if key_type == 'list':
|
|
||||||
list_length = redis_client.llen(queue_to_check)
|
|
||||||
logger.info(f"Redis key '{queue_to_check}' is a LIST with {list_length} items.")
|
|
||||||
if list_length > 0:
|
|
||||||
items_to_fetch = min(max_items, list_length)
|
|
||||||
# lrange with negative indices gets items from the end (most recent for rpush)
|
|
||||||
contents_bytes = redis_client.lrange(queue_to_check, -items_to_fetch, -1)
|
|
||||||
contents = [item.decode('utf-8') for item in contents_bytes]
|
|
||||||
contents.reverse() # Show most recent first
|
|
||||||
logger.info(f"--- Showing most recent {len(contents)} of {list_length} items ---")
|
|
||||||
for i, item in enumerate(contents):
|
|
||||||
logger.info(f" [recent_{i}]: {item}")
|
|
||||||
if list_length > len(contents):
|
|
||||||
logger.info(f" ... ({list_length - len(contents)} older items not shown)")
|
|
||||||
logger.info(f"--- End of List Contents ---")
|
|
||||||
|
|
||||||
elif key_type == 'hash':
|
|
||||||
hash_size = redis_client.hlen(queue_to_check)
|
|
||||||
logger.info(f"Redis key '{queue_to_check}' is a HASH with {hash_size} fields.")
|
|
||||||
if hash_size > 0:
|
|
||||||
logger.info(f"--- Showing a sample of up to {max_items} fields ---")
|
|
||||||
item_count = 0
|
|
||||||
# Using hscan_iter to safely iterate over hash fields, count is a hint
|
|
||||||
for field_bytes, value_bytes in redis_client.hscan_iter(queue_to_check, count=max_items):
|
|
||||||
if item_count >= max_items:
|
|
||||||
logger.info(f" ... (stopped listing after {max_items} items of {hash_size})")
|
|
||||||
break
|
|
||||||
field = field_bytes.decode('utf-8')
|
|
||||||
value = value_bytes.decode('utf-8')
|
|
||||||
# Try to pretty-print if value is JSON
|
|
||||||
try:
|
|
||||||
parsed_value = json.loads(value)
|
|
||||||
# Check for timestamp to show age
|
|
||||||
timestamp = parsed_value.get('end_time') or parsed_value.get('start_time')
|
|
||||||
age_str = ""
|
|
||||||
if timestamp:
|
|
||||||
age_seconds = (datetime.now(timezone.utc) - datetime.fromtimestamp(timestamp, timezone.utc)).total_seconds()
|
|
||||||
age_str = f" (age: {timedelta(seconds=age_seconds)})"
|
|
||||||
|
|
||||||
pretty_value = json.dumps(parsed_value, indent=2)
|
|
||||||
logger.info(f" Field '{field}'{age_str}:\n{pretty_value}")
|
|
||||||
except (json.JSONDecodeError, TypeError):
|
|
||||||
logger.info(f" Field '{field}': {value}")
|
|
||||||
item_count += 1
|
|
||||||
logger.info(f"--- End of Hash Contents ---")
|
|
||||||
|
|
||||||
elif key_type == 'none':
|
|
||||||
logger.info(f"Redis key '{queue_to_check}' does not exist.")
|
|
||||||
else:
|
|
||||||
logger.info(f"Redis key '{queue_to_check}' is of type '{key_type}'. Listing contents for this type is not implemented.")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to check/list contents of Redis key '{queue_to_check}': {e}", exc_info=True)
|
|
||||||
raise AirflowException(f"Failed to process Redis key: {e}")
|
|
||||||
|
|
||||||
# --- DAG Definition ---
|
|
||||||
default_args = {
|
|
||||||
'owner': 'airflow',
|
|
||||||
'depends_on_past': False,
|
|
||||||
'email_on_failure': False,
|
|
||||||
'email_on_retry': False,
|
|
||||||
'retries': 0, # No retries for a manual check/list operation
|
|
||||||
'start_date': days_ago(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
with DAG(
|
|
||||||
dag_id='ytdlp_mgmt_queues_check_status',
|
|
||||||
default_args=default_args,
|
|
||||||
schedule_interval=None, # Manually triggered
|
|
||||||
catchup=False,
|
|
||||||
description='Manually check the status and recent items of all YTDLP Redis queues for a given base name.',
|
|
||||||
tags=['ytdlp', 'queue', 'management', 'redis', 'manual', 'status', 'list'],
|
|
||||||
params={
|
|
||||||
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
|
|
||||||
'queue_name': Param(
|
|
||||||
DEFAULT_QUEUE_BASE_NAME,
|
|
||||||
type="string",
|
|
||||||
description="Base name for the Redis queues (e.g., 'video_queue')."
|
|
||||||
),
|
|
||||||
'max_items_to_list': Param(DEFAULT_MAX_ITEMS_TO_LIST, type="integer", description="Maximum number of recent items/fields to list from each queue."),
|
|
||||||
}
|
|
||||||
) as dag:
|
|
||||||
|
|
||||||
check_inbox_queue = PythonOperator(
|
|
||||||
task_id='check_inbox_queue',
|
|
||||||
python_callable=check_and_list_queue_callable,
|
|
||||||
op_kwargs={'queue_suffix': '_inbox'},
|
|
||||||
)
|
|
||||||
check_inbox_queue.doc_md = """
|
|
||||||
### Check Inbox Queue (`_inbox`)
|
|
||||||
Checks the status and lists the most recent URLs waiting to be processed.
|
|
||||||
The full queue name is `{{ params.queue_name }}_inbox`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
check_progress_queue = PythonOperator(
|
|
||||||
task_id='check_progress_queue',
|
|
||||||
python_callable=check_and_list_queue_callable,
|
|
||||||
op_kwargs={'queue_suffix': '_progress'},
|
|
||||||
)
|
|
||||||
check_progress_queue.doc_md = """
|
|
||||||
### Check Progress Queue (`_progress`)
|
|
||||||
Checks the status and lists a sample of URLs currently being processed.
|
|
||||||
The full queue name is `{{ params.queue_name }}_progress`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
check_result_queue = PythonOperator(
|
|
||||||
task_id='check_result_queue',
|
|
||||||
python_callable=check_and_list_queue_callable,
|
|
||||||
op_kwargs={'queue_suffix': '_result'},
|
|
||||||
)
|
|
||||||
check_result_queue.doc_md = """
|
|
||||||
### Check Result Queue (`_result`)
|
|
||||||
Checks the status and lists a sample of successfully processed URLs.
|
|
||||||
The full queue name is `{{ params.queue_name }}_result`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
check_fail_queue = PythonOperator(
|
|
||||||
task_id='check_fail_queue',
|
|
||||||
python_callable=check_and_list_queue_callable,
|
|
||||||
op_kwargs={'queue_suffix': '_fail'},
|
|
||||||
)
|
|
||||||
check_fail_queue.doc_md = """
|
|
||||||
### Check Fail Queue (`_fail`)
|
|
||||||
Checks the status and lists a sample of failed URLs.
|
|
||||||
The full queue name is `{{ params.queue_name }}_fail`.
|
|
||||||
"""
|
|
||||||
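# Illustrative only: the keys inspected above follow a "<base>_<suffix>" layout
# (e.g. video_queue_inbox, _progress, _result, _fail). A minimal sketch of seeding the
# inbox list that a producer could use; the URL argument is a placeholder.
def _example_seed_inbox(url, queue_name=DEFAULT_QUEUE_BASE_NAME, redis_conn_id=DEFAULT_REDIS_CONN_ID):
    """Push one URL onto the inbox list (RPUSH, so recent items sit at the tail)."""
    client = _get_redis_client(redis_conn_id)
    client.rpush(f"{queue_name}_inbox", url)
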
@@ -1,343 +0,0 @@
# -*- coding: utf-8 -*-
|
|
||||||
# vim:fenc=utf-8
|
|
||||||
#
|
|
||||||
# Copyright © 2024 rl <rl@rlmbp>
|
|
||||||
#
|
|
||||||
# Distributed under terms of the MIT license.
|
|
||||||
|
|
||||||
"""
|
|
||||||
DAG for processing a single YouTube URL passed via DAG run configuration.
|
|
||||||
This is the "Worker" part of a Sensor/Worker pattern.
|
|
||||||
This DAG has been refactored to use the TaskFlow API to implement worker affinity,
|
|
||||||
ensuring all tasks for a single URL run on the same machine.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from airflow.decorators import task, task_group
|
|
||||||
from airflow.exceptions import AirflowException, AirflowSkipException
|
|
||||||
from airflow.models import Variable
|
|
||||||
from airflow.models.dag import DAG
|
|
||||||
from airflow.models.param import Param
|
|
||||||
from airflow.models.xcom_arg import XComArg
|
|
||||||
from airflow.operators.dummy import DummyOperator
|
|
||||||
from airflow.operators.bash import BashOperator
|
|
||||||
from airflow.utils.dates import days_ago
|
|
||||||
from airflow.api.common.trigger_dag import trigger_dag
|
|
||||||
from datetime import timedelta, datetime
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import random
|
|
||||||
import re
|
|
||||||
import socket
|
|
||||||
import time
|
|
||||||
import traceback
|
|
||||||
import uuid
|
|
||||||
import subprocess
|
|
||||||
import shlex
|
|
||||||
|
|
||||||
# Import utility functions and Thrift modules
|
|
||||||
from utils.redis_utils import _get_redis_client
|
|
||||||
|
|
||||||
# Handle potential import issues with Thrift modules
|
|
||||||
try:
|
|
||||||
from pangramia.yt.common.ttypes import TokenUpdateMode
|
|
||||||
except ImportError as e:
|
|
||||||
logging.warning(f"Could not import TokenUpdateMode from pangramia.yt.common.ttypes: {e}")
|
|
||||||
TokenUpdateMode = None
|
|
||||||
|
|
||||||
try:
|
|
||||||
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
|
|
||||||
except ImportError as e:
|
|
||||||
logging.warning(f"Could not import PBServiceException/PBUserException from pangramia.yt.exceptions.ttypes: {e}")
|
|
||||||
PBServiceException = Exception
|
|
||||||
PBUserException = Exception
|
|
||||||
|
|
||||||
try:
|
|
||||||
from pangramia.yt.tokens_ops import YTTokenOpService
|
|
||||||
except ImportError as e:
|
|
||||||
logging.warning(f"Could not import YTTokenOpService from pangramia.yt.tokens_ops: {e}")
|
|
||||||
YTTokenOpService = None
|
|
||||||
|
|
||||||
try:
|
|
||||||
from thrift.protocol import TBinaryProtocol
|
|
||||||
from thrift.transport import TSocket, TTransport
|
|
||||||
from thrift.transport.TTransport import TTransportException
|
|
||||||
except ImportError as e:
|
|
||||||
logging.warning(f"Could not import thrift modules: {e}")
|
|
||||||
TBinaryProtocol = None
|
|
||||||
TSocket = None
|
|
||||||
TTransport = None
|
|
||||||
TTransportException = Exception
|
|
||||||
|
|
||||||
# Configure logging
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Default settings from Airflow Variables or hardcoded fallbacks
|
|
||||||
DEFAULT_QUEUE_NAME = 'video_queue'
|
|
||||||
DEFAULT_REDIS_CONN_ID = 'redis_default'
|
|
||||||
DEFAULT_TIMEOUT = 3600
|
|
||||||
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
|
|
||||||
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)
|
|
||||||
|
|
||||||
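# Sketch only: the two service defaults above are read from Airflow Variables. They could be
# seeded once per environment, e.g. from an init script; the values below simply repeat the
# fallback defaults and are placeholders for real deployment values.
def _example_seed_service_variables(ip="172.17.0.1", port=9080):
    """Illustrative helper to set the Variables consumed above (not called anywhere)."""
    Variable.set("YT_AUTH_SERVICE_IP", ip)
    Variable.set("YT_AUTH_SERVICE_PORT", str(port))
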
# The queue is set to a fallback here. The actual worker-specific queue is
# assigned just-in-time by the task_instance_mutation_hook in airflow_local_settings.py,
# which reads the 'worker_queue' from the DAG run configuration.
DEFAULT_ARGS = {
    'owner': 'airflow',
    'retries': 0,
    'queue': 'queue-dl',  # Fallback queue. Will be overridden by the policy hook.
}

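# A minimal sketch (assumption, untested) of the policy hook referenced above, as it might
# appear in airflow_local_settings.py. The real hook must be named task_instance_mutation_hook
# and live in that settings module; this copy is only illustrative. It re-routes every task
# of a run to the queue named in the run's 'worker_queue' conf entry.
def _example_task_instance_mutation_hook(task_instance):
    try:
        dag_run = task_instance.get_dagrun()
        worker_queue = ((dag_run.conf or {}) if dag_run else {}).get('worker_queue')
        if worker_queue:
            task_instance.queue = worker_queue
    except Exception:
        # Never block scheduling because of the hook; keep the fallback queue instead.
        pass
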
# --- Helper Functions ---
|
|
||||||
|
|
||||||
def _get_thrift_client(host, port, timeout):
    """Helper to create and connect a Thrift client."""
    if not TSocket or not TTransport or not TBinaryProtocol:
        raise AirflowException("Required Thrift modules are not available")

    transport = TSocket.TSocket(host, port)
    transport.setTimeout(timeout * 1000)
    transport = TTransport.TFramedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)  # protocol must wrap the framed transport
    client = YTTokenOpService.Client(protocol) if YTTokenOpService else None
    if client:
        transport.open()
        logger.info(f"Connected to Thrift server at {host}:{port}")
    return client, transport


def _extract_video_id(url):
    """Extracts YouTube video ID from URL."""
    if not url or not isinstance(url, str):
        return None
    patterns = [r'v=([a-zA-Z0-9_-]{11})', r'youtu\.be/([a-zA-Z0-9_-]{11})']
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

def _get_account_pool(params: dict) -> list:
|
|
||||||
"""
|
|
||||||
Gets the list of accounts to use for processing, filtering out banned/resting accounts.
|
|
||||||
Supports explicit list, prefix-based generation, and single account modes.
|
|
||||||
"""
|
|
||||||
account_pool_str = params.get('account_pool', 'default_account')
|
|
||||||
accounts = []
|
|
||||||
is_prefix_mode = False
|
|
||||||
|
|
||||||
if ',' in account_pool_str:
|
|
||||||
accounts = [acc.strip() for acc in account_pool_str.split(',') if acc.strip()]
|
|
||||||
else:
|
|
||||||
prefix = account_pool_str
|
|
||||||
pool_size_param = params.get('account_pool_size')
|
|
||||||
if pool_size_param is not None:
|
|
||||||
is_prefix_mode = True
|
|
||||||
pool_size = int(pool_size_param)
|
|
||||||
accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
|
|
||||||
else:
|
|
||||||
accounts = [prefix]
|
|
||||||
|
|
||||||
if not accounts:
|
|
||||||
raise AirflowException("Initial account pool is empty.")
|
|
||||||
|
|
||||||
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
|
|
||||||
try:
|
|
||||||
redis_client = _get_redis_client(redis_conn_id)
|
|
||||||
active_accounts = []
|
|
||||||
for account in accounts:
|
|
||||||
status_bytes = redis_client.hget(f"account_status:{account}", "status")
|
|
||||||
status = status_bytes.decode('utf-8') if status_bytes else "ACTIVE"
|
|
||||||
if status not in ['BANNED'] and 'RESTING' not in status:
|
|
||||||
active_accounts.append(account)
|
|
||||||
|
|
||||||
if not active_accounts and accounts:
|
|
||||||
auto_create = params.get('auto_create_new_accounts_on_exhaustion', False)
|
|
||||||
if auto_create and is_prefix_mode:
|
|
||||||
new_account_id = f"{account_pool_str}-auto-{str(uuid.uuid4())[:8]}"
|
|
||||||
logger.warning(f"Account pool exhausted. Auto-creating new account: '{new_account_id}'")
|
|
||||||
active_accounts.append(new_account_id)
|
|
||||||
else:
|
|
||||||
raise AirflowException("All accounts in the configured pool are currently exhausted.")
|
|
||||||
accounts = active_accounts
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Could not filter accounts from Redis. Using unfiltered pool. Error: {e}", exc_info=True)
|
|
||||||
|
|
||||||
if not accounts:
|
|
||||||
raise AirflowException("Account pool is empty after filtering.")
|
|
||||||
|
|
||||||
logger.info(f"Final active account pool with {len(accounts)} accounts.")
|
|
||||||
return accounts
|
|
||||||
|
|
||||||
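# Illustrative only: the pool filter above reads the "status" field of the
# "account_status:<account_id>" hash. A companion maintenance task could mark an account as
# resting like this; the TTL used for the cool-down window is an assumption, not a value
# taken from this repository.
def _example_rest_account(account_id, redis_conn_id=DEFAULT_REDIS_CONN_ID):
    client = _get_redis_client(redis_conn_id)
    client.hset(f"account_status:{account_id}", mapping={"status": "RESTING", "since": int(time.time())})
    client.expire(f"account_status:{account_id}", 6 * 3600)  # assumed cool-down window
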
# =============================================================================
|
|
||||||
# TASK DEFINITIONS (TaskFlow API)
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
@task
|
|
||||||
def get_url_and_assign_account(**context):
|
|
||||||
"""
|
|
||||||
Gets the URL to process from the DAG run configuration and assigns an active account.
|
|
||||||
This is the first task in the pinned-worker DAG.
|
|
||||||
"""
|
|
||||||
params = context['params']
|
|
||||||
|
|
||||||
# Update yt-dlp to latest nightly before every run
|
|
||||||
subprocess.run(["/usr/local/bin/update-yt-dlp.sh"], check=True)
|
|
||||||
|
|
||||||
# The URL is passed by the dispatcher DAG.
|
|
||||||
url_to_process = params.get('url_to_process')
|
|
||||||
if not url_to_process:
|
|
||||||
raise AirflowException("'url_to_process' was not found in the DAG run configuration.")
|
|
||||||
logger.info(f"Received URL '{url_to_process}' to process.")
|
|
||||||
|
|
||||||
# Account assignment logic is the same as before.
|
|
||||||
account_id = random.choice(_get_account_pool(params))
|
|
||||||
logger.info(f"Selected account '{account_id}' for this run.")
|
|
||||||
|
|
||||||
return {
|
|
||||||
'url_to_process': url_to_process,
|
|
||||||
'account_id': account_id,
|
|
||||||
'accounts_tried': [account_id],
|
|
||||||
}
|
|
||||||
|
|
||||||
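# Illustrative only: a dispatcher DAG is expected to hand work to this worker via the run
# configuration read above ('url_to_process') and the queue pin ('worker_queue'). A sketch of
# that hand-off using the trigger_dag API imported at the top of this file; the worker DAG id
# below is a placeholder.
def _example_dispatch(url, worker_queue):
    trigger_dag(
        dag_id='ytdlp_worker_per_url',  # placeholder id for this worker DAG
        conf={'url_to_process': url, 'worker_queue': worker_queue},
    )
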
@task
|
|
||||||
def get_token(initial_data: dict, **context):
|
|
||||||
"""Makes a single attempt to get a token from the Thrift service."""
|
|
||||||
ti = context['task_instance']
|
|
||||||
params = context['params']
|
|
||||||
|
|
||||||
account_id = initial_data['account_id']
|
|
||||||
url = initial_data['url_to_process']
|
|
||||||
info_json_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')
|
|
||||||
|
|
||||||
host, port, timeout = params['service_ip'], int(params['service_port']), int(params.get('timeout', DEFAULT_TIMEOUT))
|
|
||||||
machine_id = params.get('machine_id') or socket.gethostname()
|
|
||||||
|
|
||||||
logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' ---")
|
|
||||||
client, transport = None, None
|
|
||||||
try:
|
|
||||||
client, transport = _get_thrift_client(host, port, timeout)
|
|
||||||
if not client or not TokenUpdateMode:
|
|
||||||
raise AirflowException("Thrift client or TokenUpdateMode not available")
|
|
||||||
|
|
||||||
token_data = client.getOrRefreshToken(accountId=account_id, updateType=TokenUpdateMode.AUTO, url=url, clients=params.get('clients'), machineId=machine_id)
|
|
||||||
|
|
||||||
info_json = getattr(token_data, 'infoJson', None)
|
|
||||||
if not (info_json and json.loads(info_json)):
|
|
||||||
raise AirflowException("Service returned success but info.json was empty or invalid.")
|
|
||||||
|
|
||||||
video_id = _extract_video_id(url)
|
|
||||||
os.makedirs(info_json_dir, exist_ok=True)
|
|
||||||
# Use a readable timestamp for a unique filename on each attempt.
|
|
||||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
||||||
info_json_path = os.path.join(info_json_dir, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json")
|
|
||||||
with open(info_json_path, 'w', encoding='utf-8') as f:
|
|
||||||
f.write(info_json)
|
|
||||||

        proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
        return {
            'info_json_path': info_json_path,
            'socks_proxy': getattr(token_data, proxy_attr) if proxy_attr else None,
            'ytdlp_command': getattr(token_data, 'ytdlpCommand', None),  # command string returned by the service
            'successful_account_id': account_id,
            'original_url': url,  # Include original URL for fallback
        }
except (PBServiceException, PBUserException, TTransportException) as e:
|
|
||||||
error_context = getattr(e, 'context', None)
|
|
||||||
if isinstance(error_context, str):
|
|
||||||
try: error_context = json.loads(error_context.replace("'", "\""))
|
|
||||||
except: pass
|
|
||||||
|
|
||||||
error_details = {
|
|
||||||
'error_message': getattr(e, 'message', str(e)),
|
|
||||||
'error_code': getattr(e, 'errorCode', 'TRANSPORT_ERROR'),
|
|
||||||
'proxy_url': error_context.get('proxy_url') if isinstance(error_context, dict) else None
|
|
||||||
}
|
|
||||||
logger.error(f"Thrift call failed for account '{account_id}'. Exception: {error_details['error_message']}")
|
|
||||||
ti.xcom_push(key='error_details', value=error_details)
|
|
||||||
|
|
||||||
# If it's not a connection error, run diagnostic yt-dlp command
|
|
||||||
if error_details['error_code'] not in ["SOCKS5_CONNECTION_FAILED", "SOCKET_TIMEOUT", "TRANSPORT_ERROR", "CAMOUFOX_TIMEOUT"]:
|
|
||||||
_run_diagnostic_yt_dlp(url, error_details.get('proxy_url'), params.get('clients', 'web'))
|
|
||||||
|
|
||||||
raise AirflowException(f"Thrift call failed: {error_details['error_message']}")
|
|
||||||
finally:
|
|
||||||
if transport and transport.isOpen():
|
|
||||||
transport.close()
|
|
||||||
|
|
||||||
def _run_diagnostic_yt_dlp(url, proxy, clients):
|
|
||||||
"""Runs yt-dlp with diagnostic flags to capture failed responses."""
|
|
||||||
logger.warning("Running diagnostic yt-dlp command to capture failed response...")
|
|
||||||
|
|
||||||
dump_dir = "/opt/airflow/dumps"
|
|
||||||
os.makedirs(dump_dir, exist_ok=True)
|
|
||||||
|
|
||||||
video_id = _extract_video_id(url)
|
|
||||||
dump_file = os.path.join(dump_dir, f"diagnostic_{video_id}_{int(time.time())}.dump")
|
|
||||||
|
|
||||||
cmd = [
|
|
||||||
'yt-dlp',
|
|
||||||
'--extractor-args', f'youtube:player-client={clients}',
|
|
||||||
'--write-pages',
|
|
||||||
'--proxy', proxy or '',
|
|
||||||
'-FvU',
|
|
||||||
url,
|
|
||||||
'--write-info-json',
|
|
||||||
'--print', 'filename',
|
|
||||||
'--continue',
|
|
||||||
'--no-progress',
|
|
||||||
'--no-simulate',
|
|
||||||
'--ignore-errors',
|
|
||||||
'--no-playlist'
|
|
||||||
]
|
|
||||||
|
|
||||||
logger.info(f"Executing diagnostic command: {' '.join(shlex.quote(arg) for arg in cmd)}")
|
|
||||||
logger.info(f"Diagnostic dump will be saved to: {dump_file}")
|
|
||||||
|
|
||||||
try:
|
|
||||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
|
|
||||||
logger.info(f"Diagnostic yt-dlp exit code: {result.returncode}")
|
|
||||||
if result.stdout:
|
|
||||||
logger.info(f"Diagnostic output:\n{result.stdout}")
|
|
||||||
if result.stderr:
|
|
||||||
logger.error(f"Diagnostic stderr:\n{result.stderr}")
|
|
||||||
except subprocess.TimeoutExpired:
|
|
||||||
logger.error("Diagnostic yt-dlp command timed out after 5 minutes")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to run diagnostic yt-dlp: {e}")
|
|
||||||
|
|
||||||
@task.branch
def handle_bannable_error_branch(task_id_to_check: str, **context):
    """Inspects a failed task and routes to retry logic if the error is bannable."""
    ti = context['task_instance']
    params = context['params']
    error_details = ti.xcom_pull(task_ids=task_id_to_check, key='error_details')
    if not error_details:
        return None # Let DAG fail for unexpected errors

    error_code = error_details.get('error_code', '').strip()
    policy = params.get('on_bannable_failure', 'retry_with_new_account')

    # Connection errors should be retried without banning the account.
    connection_errors = ['SOCKS5_CONNECTION_FAILED', 'SOCKET_TIMEOUT', 'TRANSPORT_ERROR', 'CAMOUFOX_TIMEOUT']
    if error_code in connection_errors:
        logger.info(f"Handling connection error '{error_code}' from '{task_id_to_check}'. Policy: '{policy}'")
        if policy == 'stop_loop':
            logger.warning(f"Connection error with 'stop_loop' policy. Failing DAG without banning.")
            return None
        else:
            logger.info("Retrying with a new account without banning.")
            return 'assign_new_account_for_retry'

    is_bannable = error_code in ["BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED"]

    logger.info(f"Handling failure from '{task_id_to_check}'. Error code: '{error_code}', Policy: '{policy}'")
    if is_bannable and policy in ['retry_with_new_account', 'retry_and_ban_account_only']:
        return 'ban_account_and_prepare_for_retry'
    if is_bannable and policy in ['retry_on_connection_error', 'retry_without_ban']:
        return 'assign_new_account_for_retry'
    if is_bannable: # stop_loop
        return 'ban_and_fail'
    return None # Not a bannable error, let DAG fail
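
# A sketch of the XCom contract assumed by the branch above (illustration only):
# the failed task identified by task_id_to_check is expected to have pushed a dict
# under key 'error_details' before failing, for example:
#
#     context['task_instance'].xcom_push(
#         key='error_details',
#         value={'error_code': 'BOT_DETECTED',
#                'error_message': 'Sign in to confirm you are not a bot'},   # hypothetical message
#     )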
@ -1,707 +0,0 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.

"""
DAG for processing YouTube URLs sequentially from a Redis queue using YTDLP Ops Thrift service.
"""

from airflow import DAG
from airflow.exceptions import AirflowException, AirflowSkipException, AirflowFailException
from airflow.hooks.base import BaseHook
from airflow.models import BaseOperator, Variable
from airflow.models.param import Param
from airflow.operators.bash import BashOperator # Import BashOperator
from airflow.operators.python import PythonOperator
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
from airflow.utils.decorators import apply_defaults
from datetime import datetime, timedelta
from pangramia.yt.common.ttypes import TokenUpdateMode
from pangramia.yt.exceptions.ttypes import PBServiceException
from pangramia.yt.tokens_ops import YTTokenOpService
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport
from thrift.transport.TTransport import TTransportException
import json
import logging
import os
import redis # Import redis exceptions if needed
import socket
import time
import traceback # For logging stack traces in failure handler

# Configure logging
logger = logging.getLogger(__name__)

# Default settings
DEFAULT_QUEUE_NAME = 'video_queue' # Base name for queues
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_TIMEOUT = 30 # Default Thrift timeout in seconds
MAX_RETRIES_REDIS_LOOKUP = 3 # Retries for fetching service details from Redis
RETRY_DELAY_REDIS_LOOKUP = 10 # Delay (seconds) for Redis lookup retries

# --- Helper Functions ---

from utils.redis_utils import _get_redis_client

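# Queue key layout used by the callables below (summary for illustration; names are
# derived from the DAG's queue_name param, default DEFAULT_QUEUE_NAME):
#
#   <queue_name>_inbox     Redis list - URLs waiting to be processed (RPUSH in / LPOP out)
#   <queue_name>_progress  Redis hash - URL -> JSON while a DAG run is processing it
#   <queue_name>_result    Redis hash - URL -> JSON describing a successful run
#   <queue_name>_fail      Redis hash - URL -> JSON describing a failed run
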
def _extract_video_id(url):
    """Extracts YouTube video ID from URL."""
    if not url or not isinstance(url, str):
        logger.debug("URL is empty or not a string, cannot extract video ID.")
        return None
    try:
        video_id = None
        if 'youtube.com/watch?v=' in url:
            video_id = url.split('v=')[1].split('&')[0]
        elif 'youtu.be/' in url:
            video_id = url.split('youtu.be/')[1].split('?')[0]

        if video_id and len(video_id) >= 11:
            video_id = video_id[:11] # Standard ID length
            logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
            return video_id
        else:
            logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
            return None
    except Exception as e:
        logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
        return None

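# For illustration, the helper above resolves both supported URL forms to the
# 11-character video ID and returns None otherwise:
#
#     _extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  # -> "dQw4w9WgXcQ"
#     _extract_video_id("https://youtu.be/dQw4w9WgXcQ?t=42")            # -> "dQw4w9WgXcQ"
#     _extract_video_id("https://example.com/not-youtube")              # -> None
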
# --- Queue Management Callables ---

def pop_url_from_queue(**context):
    """Pops a URL from the inbox queue and pushes to XCom."""
    params = context['params']
    queue_name = params['queue_name']
    inbox_queue = f"{queue_name}_inbox"
    redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
    logger.info(f"Attempting to pop URL from inbox queue: {inbox_queue}")

    try:
        client = _get_redis_client(redis_conn_id)
        # LPOP is non-blocking, returns None if empty
        url_bytes = client.lpop(inbox_queue) # Returns bytes if decode_responses=False on hook/client

        if url_bytes:
            url = url_bytes.decode('utf-8') if isinstance(url_bytes, bytes) else url_bytes
            logger.info(f"Popped URL: {url}")
            context['task_instance'].xcom_push(key='current_url', value=url)
            return url # Return URL for logging/potential use
        else:
            logger.info(f"Inbox queue '{inbox_queue}' is empty. Skipping downstream tasks.")
            context['task_instance'].xcom_push(key='current_url', value=None)
            # Raise AirflowSkipException to signal downstream tasks to skip
            raise AirflowSkipException(f"Inbox queue '{inbox_queue}' is empty.")
    except AirflowSkipException:
        raise # Re-raise skip exception
    except Exception as e:
        logger.error(f"Error popping URL from Redis queue '{inbox_queue}': {e}", exc_info=True)
        raise AirflowException(f"Failed to pop URL from Redis: {e}")

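# A minimal producer sketch (assumption, not part of the original DAG): URLs enter
# the pipeline by being pushed onto the "<queue_name>_inbox" list, which the LPOP in
# pop_url_from_queue() then consumes in FIFO order. Host, port and credentials below
# are placeholders.
def enqueue_urls(urls, queue_name=DEFAULT_QUEUE_NAME, host="localhost", port=6379):
    """Illustrative helper: RPUSH URLs onto the inbox list consumed by this DAG."""
    client = redis.Redis(host=host, port=port, decode_responses=True)
    return client.rpush(f"{queue_name}_inbox", *urls)
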
def move_url_to_progress(**context):
    """Moves the current URL from XCom to the progress hash."""
    ti = context['task_instance']
    url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url')

    # This task should be skipped if pop_url_from_queue raised AirflowSkipException
    # Adding check for robustness
    if not url:
        logger.info("No URL found in XCom (or upstream skipped). Skipping move to progress.")
        raise AirflowSkipException("No URL to process.")

    params = context['params']
    queue_name = params['queue_name']
    progress_queue = f"{queue_name}_progress"
    redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
    logger.info(f"Moving URL '{url}' to progress hash: {progress_queue}")

    progress_data = {
        'status': 'processing',
        'start_time': time.time(),
        'dag_run_id': context['dag_run'].run_id,
        'task_instance_key_str': context['task_instance_key_str']
    }

    try:
        client = _get_redis_client(redis_conn_id)
        client.hset(progress_queue, url, json.dumps(progress_data))
        logger.info(f"Moved URL '{url}' to progress hash '{progress_queue}'.")
    except Exception as e:
        logger.error(f"Error moving URL to Redis progress hash '{progress_queue}': {e}", exc_info=True)
        # If this fails, the URL is popped but not tracked as processing. Fail the task.
        raise AirflowException(f"Failed to move URL to progress hash: {e}")

def handle_success(**context):
    """Moves URL from progress to result hash on success."""
    ti = context['task_instance']
    url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url')
    if not url:
        logger.warning("handle_success called but no URL found from pop_url_from_queue XCom. This shouldn't happen on success path.")
        return # Or raise error

    params = context['params']
    queue_name = params['queue_name']
    progress_queue = f"{queue_name}_progress"
    result_queue = f"{queue_name}_result"
    redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)

    # Pull results from get_token task
    info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
    socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
    ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command') # Original command
    downloaded_file_path = ti.xcom_pull(task_ids='download_video') # Pull from download_video task

    logger.info(f"Handling success for URL: {url}")
    logger.info(f" Info JSON Path: {info_json_path}")
    logger.info(f" SOCKS Proxy: {socks_proxy}")
    logger.info(f" YTDLP Command: {ytdlp_command[:100] if ytdlp_command else 'None'}...") # Log truncated command
    logger.info(f" Downloaded File Path: {downloaded_file_path}")

    result_data = {
        'status': 'success',
        'end_time': time.time(),
        'info_json_path': info_json_path,
        'socks_proxy': socks_proxy,
        'ytdlp_command': ytdlp_command,
        'downloaded_file_path': downloaded_file_path,
        'url': url,
        'dag_run_id': context['dag_run'].run_id,
        'task_instance_key_str': context['task_instance_key_str'] # Record which task instance succeeded
    }

    try:
        client = _get_redis_client(redis_conn_id)
        # Remove from progress hash
        removed_count = client.hdel(progress_queue, url)
        if removed_count > 0:
            logger.info(f"Removed URL '{url}' from progress hash '{progress_queue}'.")
        else:
            logger.warning(f"URL '{url}' not found in progress hash '{progress_queue}' during success handling.")

        # Add to result hash
        client.hset(result_queue, url, json.dumps(result_data))
        logger.info(f"Stored success result for URL '{url}' in result hash '{result_queue}'.")

    except Exception as e:
        logger.error(f"Error handling success in Redis for URL '{url}': {e}", exc_info=True)
        # Even if Redis fails, the task succeeded. Log error but don't fail the task.
        # Consider adding retry logic for Redis operations here or marking state differently.

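# Sketch (assumption, not part of the original DAG): a downstream consumer could read
# the per-URL entry written by handle_success() from the "<queue_name>_result" hash.
# Host and port below are placeholders.
def read_success_result(url, queue_name=DEFAULT_QUEUE_NAME, host="localhost", port=6379):
    """Illustrative helper: fetch the JSON result stored for a processed URL."""
    client = redis.Redis(host=host, port=port, decode_responses=True)
    raw = client.hget(f"{queue_name}_result", url)
    # Fields mirror result_data above: status, info_json_path, downloaded_file_path, ...
    return json.loads(raw) if raw else None
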
def handle_failure(**context):
    """
    Handles failed processing. Depending on the `requeue_on_failure` parameter,
    it either moves the URL to the fail hash or re-queues it in the inbox.
    If `stop_on_failure` is True, this task will fail, stopping the DAG loop.
    """
    ti = context['task_instance']
    url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url')
    if not url:
        logger.error("handle_failure called but no URL found from pop_url_from_queue XCom.")
        return

    params = context['params']
    queue_name = params['queue_name']
    progress_queue = f"{queue_name}_progress"
    fail_queue = f"{queue_name}_fail"
    inbox_queue = f"{queue_name}_inbox"
    redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
    requeue_on_failure = params.get('requeue_on_failure', False)
    stop_on_failure = params.get('stop_on_failure', True) # Default to True

    exception = context.get('exception')
    error_message = str(exception) if exception else "Unknown error"
    tb_str = traceback.format_exc() if exception else "No traceback available."

    logger.info(f"Handling failure for URL: {url}")
    logger.error(f" Failure Reason: {error_message}")
    logger.debug(f" Traceback:\n{tb_str}")

    try:
        client = _get_redis_client(redis_conn_id)
        # Always remove from progress hash first
        removed_count = client.hdel(progress_queue, url)
        if removed_count > 0:
            logger.info(f"Removed URL '{url}' from progress hash '{progress_queue}'.")
        else:
            logger.warning(f"URL '{url}' not found in progress hash '{progress_queue}' during failure handling.")

        if requeue_on_failure:
            # Re-queue the URL for another attempt
            client.rpush(inbox_queue, url)
            logger.info(f"Re-queued failed URL '{url}' to inbox '{inbox_queue}' for retry.")
        else:
            # Move to the permanent fail hash
            fail_data = {
                'status': 'failed',
                'end_time': time.time(),
                'error': error_message,
                'traceback': tb_str,
                'url': url,
                'dag_run_id': context['dag_run'].run_id,
                'task_instance_key_str': context['task_instance_key_str']
            }
            client.hset(fail_queue, url, json.dumps(fail_data))
            logger.info(f"Stored failure details for URL '{url}' in fail hash '{fail_queue}'.")

    except Exception as e:
        logger.error(f"Error during failure handling in Redis for URL '{url}': {e}", exc_info=True)
        # This is a critical error in the failure handling logic itself.
        raise AirflowException(f"Could not handle failure in Redis: {e}")

    # After handling Redis, decide whether to fail the task to stop the loop
    if stop_on_failure:
        logger.error("stop_on_failure is True. Failing this task to stop the DAG loop.")
        # Re-raise the original exception to fail the task instance.
        # This is better than AirflowFailException because it preserves the original error.
        if exception:
            raise exception
        else:
            # If for some reason there's no exception, fail explicitly.
            raise AirflowFailException("Failing task as per stop_on_failure=True, but original exception was not found.")

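# Sketch (assumption, not part of the original DAG): URLs parked in the
# "<queue_name>_fail" hash by handle_failure() can be re-driven manually by moving
# them back onto the inbox list, mirroring the requeue_on_failure behaviour above.
# Host and port below are placeholders.
def requeue_failed_urls(queue_name=DEFAULT_QUEUE_NAME, host="localhost", port=6379):
    """Illustrative helper: push every failed URL back onto the inbox and clear the fail hash."""
    client = redis.Redis(host=host, port=port, decode_responses=True)
    fail_hash = f"{queue_name}_fail"
    for url in client.hkeys(fail_hash):
        client.rpush(f"{queue_name}_inbox", url)
        client.hdel(fail_hash, url)
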
# --- YtdlpOpsOperator ---
|
|
||||||
|
|
||||||
class YtdlpOpsOperator(BaseOperator):
|
|
||||||
"""
|
|
||||||
Custom Airflow operator to interact with YTDLP Thrift service. Handles direct connections
|
|
||||||
and Redis-based discovery, retrieves tokens, saves info.json, and manages errors.
|
|
||||||
Modified to pull URL from XCom for sequential processing.
|
|
||||||
"""
|
|
||||||
# Removed 'url' from template_fields as it's pulled from XCom
|
|
||||||
template_fields = ('service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir', 'redis_conn_id')
|
|
||||||
|
|
||||||
@apply_defaults
|
|
||||||
def __init__(self,
|
|
||||||
# url parameter removed - will be pulled from XCom
|
|
||||||
redis_conn_id=DEFAULT_REDIS_CONN_ID,
|
|
||||||
max_retries_lookup=MAX_RETRIES_REDIS_LOOKUP,
|
|
||||||
retry_delay_lookup=RETRY_DELAY_REDIS_LOOKUP,
|
|
||||||
service_ip=None,
|
|
||||||
service_port=None,
|
|
||||||
redis_enabled=False, # Default to direct connection now
|
|
||||||
account_id=None,
|
|
||||||
# save_info_json removed, always True
|
|
||||||
info_json_dir=None,
|
|
||||||
# get_socks_proxy removed, always True
|
|
||||||
# store_socks_proxy removed, always True
|
|
||||||
# get_socks_proxy=True, # Removed
|
|
||||||
# store_socks_proxy=True, # Store proxy in XCom by default # Removed
|
|
||||||
timeout=DEFAULT_TIMEOUT,
|
|
||||||
*args, **kwargs):
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
|
|
||||||
logger.info(f"Initializing YtdlpOpsOperator (Processor Version) with parameters: "
|
|
||||||
f"redis_conn_id={redis_conn_id}, max_retries_lookup={max_retries_lookup}, retry_delay_lookup={retry_delay_lookup}, "
|
|
||||||
f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, "
|
|
||||||
f"account_id={account_id}, info_json_dir={info_json_dir}, timeout={timeout}")
|
|
||||||
# save_info_json, get_socks_proxy, store_socks_proxy removed from log
|
|
||||||
|
|
||||||
# Validate parameters based on connection mode
|
|
||||||
if redis_enabled:
|
|
||||||
# If using Redis, account_id is essential for lookup
|
|
||||||
if not account_id:
|
|
||||||
raise ValueError("account_id is required when redis_enabled=True for service lookup.")
|
|
||||||
else:
|
|
||||||
# If direct connection, IP and Port are essential
|
|
||||||
if not service_ip or not service_port:
|
|
||||||
raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False.")
|
|
||||||
# Account ID is still needed for the API call itself, but rely on DAG param or operator config
|
|
||||||
if not account_id:
|
|
||||||
logger.warning("No account_id provided for direct connection mode. Ensure it's set in DAG params or operator config.")
|
|
||||||
# We won't assign 'default' here, let the value passed during instantiation be used.
|
|
||||||
|
|
||||||
# self.url is no longer needed here
|
|
||||||
self.redis_conn_id = redis_conn_id
|
|
||||||
self.max_retries_lookup = max_retries_lookup
|
|
||||||
self.retry_delay_lookup = int(retry_delay_lookup.total_seconds() if isinstance(retry_delay_lookup, timedelta) else retry_delay_lookup)
|
|
||||||
self.service_ip = service_ip
|
|
||||||
self.service_port = service_port
|
|
||||||
self.redis_enabled = redis_enabled
|
|
||||||
self.account_id = account_id
|
|
||||||
# self.save_info_json removed
|
|
||||||
self.info_json_dir = info_json_dir # Still needed
|
|
||||||
# self.get_socks_proxy removed
|
|
||||||
# self.store_socks_proxy removed
|
|
||||||
self.timeout = timeout
|
|
||||||
|
|
||||||
def execute(self, context):
|
|
||||||
logger.info("Executing YtdlpOpsOperator (Processor Version)")
|
|
||||||
transport = None
|
|
||||||
ti = context['task_instance'] # Get task instance for XCom access
|
|
||||||
|
|
||||||
try:
|
|
||||||
# --- Get URL from XCom ---
|
|
||||||
url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url')
|
|
||||||
if not url:
|
|
||||||
# This should ideally be caught by upstream skip, but handle defensively
|
|
||||||
logger.info("No URL found in XCom from pop_url_from_queue. Skipping execution.")
|
|
||||||
raise AirflowSkipException("Upstream task did not provide a URL.")
|
|
||||||
logger.info(f"Processing URL from XCom: {url}")
|
|
||||||
# --- End Get URL ---
|
|
||||||
|
|
||||||
logger.info("Getting task parameters and rendering templates")
|
|
||||||
params = context['params'] # DAG run params
|
|
||||||
|
|
||||||
# Render template fields using context
|
|
||||||
# Use render_template_as_native for better type handling if needed, else render_template
|
|
||||||
redis_conn_id = self.render_template(self.redis_conn_id, context)
|
|
||||||
service_ip = self.render_template(self.service_ip, context)
|
|
||||||
service_port_rendered = self.render_template(self.service_port, context)
|
|
||||||
account_id = self.render_template(self.account_id, context)
|
|
||||||
timeout_rendered = self.render_template(self.timeout, context)
|
|
||||||
info_json_dir = self.render_template(self.info_json_dir, context) # Rendered here for _save_info_json
|
|
||||||
|
|
||||||
# Determine effective settings (DAG params override operator defaults)
|
|
||||||
redis_enabled = params.get('redis_enabled', self.redis_enabled)
|
|
||||||
account_id = params.get('account_id', account_id) # Use DAG param if provided
|
|
||||||
redis_conn_id = params.get('redis_conn_id', redis_conn_id) # Use DAG param if provided
|
|
||||||
|
|
||||||
logger.info(f"Effective settings: redis_enabled={redis_enabled}, account_id='{account_id}', redis_conn_id='{redis_conn_id}'")
|
|
||||||
|
|
||||||
host = None
|
|
||||||
port = None
|
|
||||||
|
|
||||||
if redis_enabled:
|
|
||||||
# Get Redis connection using the helper for consistency
|
|
||||||
redis_client = _get_redis_client(redis_conn_id)
|
|
||||||
logger.info(f"Successfully connected to Redis using connection '{redis_conn_id}' for service discovery.")
|
|
||||||
|
|
||||||
# Get service details from Redis with retries
|
|
||||||
service_key = f"ytdlp:{account_id}"
|
|
||||||
legacy_key = account_id # For backward compatibility
|
|
||||||
|
|
||||||
for attempt in range(self.max_retries_lookup):
|
|
||||||
try:
|
|
||||||
logger.info(f"Attempt {attempt + 1}/{self.max_retries_lookup}: Fetching service details from Redis for keys: '{service_key}', '{legacy_key}'")
|
|
||||||
service_details = redis_client.hgetall(service_key)
|
|
||||||
if not service_details:
|
|
||||||
logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'")
|
|
||||||
service_details = redis_client.hgetall(legacy_key)
|
|
||||||
|
|
||||||
if not service_details:
|
|
||||||
raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}")
|
|
||||||
|
|
||||||
# Find IP and port (case-insensitive keys)
|
|
||||||
ip_key = next((k for k in service_details if k.lower() == 'ip'), None)
|
|
||||||
port_key = next((k for k in service_details if k.lower() == 'port'), None)
|
|
||||||
|
|
||||||
if not ip_key: raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}")
|
|
||||||
if not port_key: raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}")
|
|
||||||
|
|
||||||
host = service_details[ip_key] # Assumes decode_responses=True in hook
|
|
||||||
port_str = service_details[port_key]
|
|
||||||
|
|
||||||
try:
|
|
||||||
port = int(port_str)
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}")
|
|
||||||
|
|
||||||
logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}")
|
|
||||||
break # Success
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {str(e)}")
|
|
||||||
if attempt == self.max_retries_lookup - 1:
|
|
||||||
logger.error("Max retries reached for fetching Redis details.")
|
|
||||||
raise AirflowException(f"Failed to get service details from Redis after {self.max_retries_lookup} attempts: {e}")
|
|
||||||
logger.info(f"Retrying in {self.retry_delay_lookup} seconds...")
|
|
||||||
time.sleep(self.retry_delay_lookup)
|
|
||||||
else:
|
|
||||||
# Direct connection: Use rendered/param values
|
|
||||||
host = params.get('service_ip', service_ip) # Use DAG param if provided
|
|
||||||
port_str = params.get('service_port', service_port_rendered) # Use DAG param if provided
|
|
||||||
|
|
||||||
logger.info(f"Using direct connection settings: service_ip={host}, service_port={port_str}")
|
|
||||||
|
|
||||||
if not host or not port_str:
|
|
||||||
raise ValueError("Direct connection requires service_ip and service_port (check Operator config and DAG params)")
|
|
||||||
try:
|
|
||||||
port = int(port_str)
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
raise ValueError(f"Invalid service_port value: {port_str}")
|
|
||||||
|
|
||||||
logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)")
|
|
||||||
|
|
||||||
# Validate and use timeout
|
|
||||||
try:
|
|
||||||
timeout = int(timeout_rendered)
|
|
||||||
if timeout <= 0: raise ValueError("Timeout must be positive")
|
|
||||||
logger.info(f"Using timeout: {timeout} seconds")
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
logger.warning(f"Invalid timeout value: '{timeout_rendered}'. Using default: {DEFAULT_TIMEOUT}")
|
|
||||||
timeout = DEFAULT_TIMEOUT
|
|
||||||
|
|
||||||
# Create Thrift connection objects
|
|
||||||
# socket_conn = TSocket.TSocket(host, port) # Original
|
|
||||||
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET) # Explicitly use AF_INET (IPv4)
|
|
||||||
socket_conn.setTimeout(timeout * 1000) # Thrift timeout is in milliseconds
|
|
||||||
transport = TTransport.TFramedTransport(socket_conn) # Use TFramedTransport if server expects it
|
|
||||||
# transport = TTransport.TBufferedTransport(socket_conn) # Use TBufferedTransport if server expects it
|
|
||||||
protocol = TBinaryProtocol.TBinaryProtocol(transport)
|
|
||||||
client = YTTokenOpService.Client(protocol)
|
|
||||||
|
|
||||||
logger.info(f"Attempting to connect to Thrift server at {host}:{port}...")
|
|
||||||
try:
|
|
||||||
transport.open()
|
|
||||||
logger.info("Successfully connected to Thrift server.")
|
|
||||||
|
|
||||||
# Test connection with ping
|
|
||||||
try:
|
|
||||||
client.ping()
|
|
||||||
logger.info("Server ping successful.")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Server ping failed: {e}")
|
|
||||||
raise AirflowException(f"Server connection test (ping) failed: {e}")
|
|
||||||
|
|
||||||
# Get token from service using the URL from XCom
|
|
||||||
try:
|
|
||||||
logger.info(f"Requesting token for accountId='{account_id}', url='{url}'")
|
|
||||||
token_data = client.getOrRefreshToken(
|
|
||||||
accountId=account_id,
|
|
||||||
updateType=TokenUpdateMode.AUTO,
|
|
||||||
url=url # Use the url variable from XCom
|
|
||||||
)
|
|
||||||
logger.info("Successfully retrieved token data from service.")
|
|
||||||
except PBServiceException as e:
|
|
||||||
# Handle specific service exceptions
|
|
||||||
error_code = getattr(e, 'errorCode', 'N/A')
|
|
||||||
error_message = getattr(e, 'message', 'N/A')
|
|
||||||
error_context = getattr(e, 'context', {})
|
|
||||||
logger.error(f"PBServiceException occurred: Code={error_code}, Message={error_message}")
|
|
||||||
if error_context:
|
|
||||||
logger.error(f" Context: {error_context}") # Log context separately
|
|
||||||
# Construct a concise error message for AirflowException
|
|
||||||
error_msg = f"YTDLP service error (Code: {error_code}): {error_message}"
|
|
||||||
# Add specific error code handling if needed...
|
|
||||||
logger.error(f"Failing task instance due to PBServiceException: {error_msg}") # Add explicit log before raising
|
|
||||||
raise AirflowException(error_msg) # Fail task on service error
|
|
||||||
except TTransportException as e:
|
|
||||||
logger.error(f"Thrift transport error during getOrRefreshToken: {e}")
|
|
||||||
logger.error(f"Failing task instance due to TTransportException: {e}") # Add explicit log before raising
|
|
||||||
raise AirflowException(f"Transport error during API call: {e}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Unexpected error during getOrRefreshToken: {e}")
|
|
||||||
logger.error(f"Failing task instance due to unexpected error during API call: {e}") # Add explicit log before raising
|
|
||||||
raise AirflowException(f"Unexpected error during API call: {e}")
|
|
||||||
|
|
||||||
except TTransportException as e:
|
|
||||||
# Handle connection errors
|
|
||||||
logger.error(f"Thrift transport error during connection: {str(e)}")
|
|
||||||
logger.error(f"Failing task instance due to TTransportException during connection: {e}") # Add explicit log before raising
|
|
||||||
raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}")
|
|
||||||
# Removed the overly broad except Exception block here, as inner blocks raise AirflowException
|
|
||||||
|
|
||||||
# --- Process Token Data ---
|
|
||||||
logger.debug(f"Token data received. Attributes: {dir(token_data)}")
|
|
||||||
|
|
||||||
info_json_path = None # Initialize
|
|
||||||
|
|
||||||
# save_info_json is now always True
|
|
||||||
logger.info("Proceeding to save info.json (save_info_json=True).")
|
|
||||||
info_json = self._get_info_json(token_data)
|
|
||||||
if info_json and self._is_valid_json(info_json):
|
|
||||||
try:
|
|
||||||
# Pass rendered info_json_dir to helper
|
|
||||||
info_json_path = self._save_info_json(context, info_json, url, account_id, info_json_dir)
|
|
||||||
if info_json_path:
|
|
||||||
ti.xcom_push(key='info_json_path', value=info_json_path)
|
|
||||||
logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}")
|
|
||||||
else:
|
|
||||||
ti.xcom_push(key='info_json_path', value=None)
|
|
||||||
logger.warning("info.json saving failed (check logs from _save_info_json).")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True)
|
|
||||||
ti.xcom_push(key='info_json_path', value=None)
|
|
||||||
elif info_json:
|
|
||||||
logger.warning("Retrieved infoJson is not valid JSON. Skipping save.")
|
|
||||||
ti.xcom_push(key='info_json_path', value=None)
|
|
||||||
else:
|
|
||||||
logger.info("No infoJson found in token data. Skipping save.")
|
|
||||||
ti.xcom_push(key='info_json_path', value=None)
|
|
||||||
|
|
||||||
|
|
||||||
# Extract and potentially store SOCKS proxy
|
|
||||||
# get_socks_proxy and store_socks_proxy are now always True
|
|
||||||
socks_proxy = None
|
|
||||||
logger.info("Attempting to extract SOCKS proxy (get_socks_proxy=True).")
|
|
||||||
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
|
|
||||||
if proxy_attr:
|
|
||||||
socks_proxy = getattr(token_data, proxy_attr)
|
|
||||||
if socks_proxy:
|
|
||||||
logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
|
|
||||||
# Always store if found (store_socks_proxy=True)
|
|
||||||
ti.xcom_push(key='socks_proxy', value=socks_proxy)
|
|
||||||
logger.info("Pushed 'socks_proxy' to XCom.")
|
|
||||||
else:
|
|
||||||
logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty.")
|
|
||||||
# Store None if attribute found but empty
|
|
||||||
ti.xcom_push(key='socks_proxy', value=None)
|
|
||||||
logger.info("Pushed None to XCom for 'socks_proxy' as extracted value was empty.")
|
|
||||||
else:
|
|
||||||
logger.info("No SOCKS proxy attribute found in token data.")
|
|
||||||
# Store None if attribute not found
|
|
||||||
ti.xcom_push(key='socks_proxy', value=None)
|
|
||||||
logger.info("Pushed None to XCom for 'socks_proxy' as attribute was not found.")
|
|
||||||
|
|
||||||
|
|
||||||
# --- Removed old logic block ---
|
|
||||||
# # Extract and potentially store SOCKS proxy
|
|
||||||
# socks_proxy = None
|
|
||||||
# get_socks_proxy = params.get('get_socks_proxy', self.get_socks_proxy)
|
|
||||||
# store_socks_proxy = params.get('store_socks_proxy', self.store_socks_proxy)
|
|
||||||
#
|
|
||||||
# if get_socks_proxy:
|
|
||||||
# proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
|
|
||||||
# if proxy_attr:
|
|
||||||
# socks_proxy = getattr(token_data, proxy_attr)
|
|
||||||
# if socks_proxy:
|
|
||||||
# logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
|
|
||||||
# if store_socks_proxy:
|
|
||||||
# ti.xcom_push(key='socks_proxy', value=socks_proxy)
|
|
||||||
# logger.info("Pushed 'socks_proxy' to XCom.")
|
|
||||||
# else:
|
|
||||||
# logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty.")
|
|
||||||
# if store_socks_proxy: ti.xcom_push(key='socks_proxy', value=None)
|
|
||||||
# else:
|
|
||||||
# logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found.")
|
|
||||||
# if store_socks_proxy: ti.xcom_push(key='socks_proxy', value=None)
|
|
||||||
# else:
|
|
||||||
# logger.info("get_socks_proxy is False. Skipping proxy extraction.")
|
|
||||||
# if store_socks_proxy: ti.xcom_push(key='socks_proxy', value=None)
|
|
||||||
# --- End Removed old logic block ---
|
|
||||||
|
|
||||||
|
|
||||||
# Get the original command from the server, or construct a fallback
|
|
||||||
ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
|
|
||||||
if ytdlp_cmd:
|
|
||||||
logger.info(f"Original command received from server: {ytdlp_cmd[:100]}...") # Log truncated
|
|
||||||
else:
|
|
||||||
logger.warning("No 'ytdlpCommand' attribute found in token data. Constructing a fallback for logging.")
|
|
||||||
# Construct a representative command for logging purposes
|
|
||||||
if socks_proxy:
|
|
||||||
ytdlp_cmd = f"yt-dlp --dump-json --proxy \"{socks_proxy}\" \"{url}\""
|
|
||||||
else:
|
|
||||||
ytdlp_cmd = f"yt-dlp --dump-json \"{url}\""
|
|
||||||
logger.info(f"Constructed fallback command: {ytdlp_cmd}")
|
|
||||||
|
|
||||||
# Push the command to XCom
|
|
||||||
ti.xcom_push(key='ytdlp_command', value=ytdlp_cmd)
|
|
||||||
logger.info("Pushed command to XCom key 'ytdlp_command'.")
|
|
||||||
|
|
||||||
# No explicit return needed, success is implicit if no exception raised
|
|
||||||
|
|
||||||
except (AirflowSkipException, AirflowFailException) as e:
|
|
||||||
logger.info(f"Task skipped or failed explicitly: {e}")
|
|
||||||
raise # Re-raise to let Airflow handle state
|
|
||||||
except AirflowException as e: # Catch AirflowExceptions raised explicitly
|
|
||||||
logger.error(f"Operation failed due to AirflowException: {e}", exc_info=True)
|
|
||||||
raise # Re-raise AirflowExceptions to ensure task failure
|
|
||||||
except (TTransportException, PBServiceException) as e: # Catch specific Thrift/Service errors not already handled inside inner try
|
|
||||||
logger.error(f"Unhandled YTDLP Service/Transport error in outer block: {e}", exc_info=True)
|
|
||||||
logger.error(f"Failing task instance due to unhandled outer Service/Transport error: {e}") # Add explicit log before raising
|
|
||||||
raise AirflowException(f"Unhandled YTDLP service error: {e}") # Wrap in AirflowException to fail task
|
|
||||||
except Exception as e: # General catch-all for truly unexpected errors
|
|
||||||
logger.error(f"Caught unexpected error in YtdlpOpsOperator outer block: {e}", exc_info=True)
|
|
||||||
logger.error(f"Failing task instance due to unexpected outer error: {e}") # Add explicit log before raising
|
|
||||||
raise AirflowException(f"Unexpected error caused task failure: {e}") # Wrap to fail task
|
|
||||||
finally:
|
|
||||||
if transport and transport.isOpen():
|
|
||||||
logger.info("Closing Thrift transport.")
|
|
||||||
transport.close()
|
|
||||||
|
|
||||||
# --- Helper Methods ---
|
|
||||||
|
|
||||||
def _get_info_json(self, token_data):
|
|
||||||
"""Safely extracts infoJson from token data."""
|
|
||||||
return getattr(token_data, 'infoJson', None)
|
|
||||||
|
|
||||||
def _is_valid_json(self, json_str):
|
|
||||||
"""Checks if a string is valid JSON."""
|
|
||||||
if not json_str or not isinstance(json_str, str): return False
|
|
||||||
try:
|
|
||||||
json.loads(json_str)
|
|
||||||
return True
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _save_info_json(self, context, info_json, url, account_id, rendered_info_json_dir):
|
|
||||||
"""Saves info_json to a file. Uses pre-rendered directory path."""
|
|
||||||
try:
|
|
||||||
video_id = _extract_video_id(url) # Use standalone helper
|
|
||||||
|
|
||||||
save_dir = rendered_info_json_dir or "." # Use rendered path
|
|
||||||
logger.info(f"Target directory for info.json: {save_dir}")
|
|
||||||
|
|
||||||
# Ensure directory exists
|
|
||||||
try:
|
|
||||||
os.makedirs(save_dir, exist_ok=True)
|
|
||||||
logger.info(f"Ensured directory exists: {save_dir}")
|
|
||||||
except OSError as e:
|
|
||||||
logger.error(f"Could not create directory {save_dir}: {e}. Cannot save info.json.")
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Construct filename
|
|
||||||
timestamp = int(time.time())
|
|
||||||
base_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json"
|
|
||||||
info_json_path = os.path.join(save_dir, base_filename)
|
|
||||||
latest_json_path = os.path.join(save_dir, "latest.json") # Path for the latest symlink/copy
|
|
||||||
|
|
||||||
# Write to timestamped file
|
|
||||||
try:
|
|
||||||
logger.info(f"Writing info.json content (received from service) to {info_json_path}...")
|
|
||||||
with open(info_json_path, 'w', encoding='utf-8') as f:
|
|
||||||
f.write(info_json)
|
|
||||||
logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}")
|
|
||||||
except IOError as e:
|
|
||||||
logger.error(f"Failed to write info.json to {info_json_path}: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Write to latest.json (overwrite) - best effort
|
|
||||||
try:
|
|
||||||
with open(latest_json_path, 'w', encoding='utf-8') as f:
|
|
||||||
f.write(info_json)
|
|
||||||
logger.info(f"Updated latest.json file: {latest_json_path}")
|
|
||||||
except IOError as e:
|
|
||||||
logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}")
|
|
||||||
|
|
||||||
return info_json_path
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
# DAG Definition
# =============================================================================

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1, # Default retries for tasks like queue management
    'retry_delay': timedelta(minutes=1),
    'start_date': days_ago(1),
    # Add concurrency control if needed for sequential processing
    # 'concurrency': 1, # Ensure only one task instance runs at a time per DAG run
    # 'max_active_runs': 1, # Ensure only one DAG run is active
}

# Define DAG
#
# --- DAG Block Deactivated on 2025-07-16 ---
# This DAG has been replaced by the Sensor/Worker pattern implemented in:
# - ytdlp_sensor_redis_queue.py (polls the queue)
# - ytdlp_worker_per_url.py (processes a single URL)
# This code is kept for reference but is not active.
#
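# Sketch of the concurrency controls hinted at above (assumption: Airflow 2.x).
# 'concurrency' and 'max_active_runs' are DAG-level arguments rather than
# default_args keys; in Airflow 2.x 'concurrency' is exposed as 'max_active_tasks'.
#
#     with DAG(
#         dag_id="ytdlp_queue_processor",   # hypothetical id, for illustration only
#         default_args=default_args,
#         schedule_interval=None,
#         max_active_runs=1,                # only one DAG run at a time
#         max_active_tasks=1,               # only one task at a time within a run
#     ) as dag:
#         ...
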
@ -1,974 +0,0 @@
"""
DAG to deploy and manage YTDLP token service.

This DAG handles the deployment, monitoring, and cleanup of a YTDLP token service
for a given account. It supports both Redis-based service discovery and direct
connection via manually specified host and port.

Configuration Options:
- account_id: (Required) The account ID for which the service is being deployed.
- proxy: (Optional) The proxy to use for the service.
- redis_enabled: (Optional, default=True) Whether to use Redis for service discovery.
  If False, you must provide `host` and `port` manually.
- host: (Optional) The host IP of the service. Required if `redis_enabled=False`.
- port: (Optional) The port of the service. Required if `redis_enabled=False`.

Usage:
1. Redis-based service discovery:
   - Set `redis_enabled=True` (default).
   - Ensure Redis is configured in Airflow connections.
   - The DAG will automatically discover the service IP and port from Redis.

2. Manual host and port:
   - Set `redis_enabled=False`.
   - Provide `host` and `port` manually in the DAG configuration.
   - Example: {"host": "192.168.1.100", "port": 9090}.

Example Trigger Configuration:
{
    "account_id": "test_account",
    "proxy": "socks5://proxy.example.com:1080",
    "redis_enabled": False,
    "host": "192.168.1.100",
    "port": 9090
}
"""

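# Sketch (assumption, not part of the original DAG): a deployment like the one
# documented above can be triggered through Airflow's stable REST API, e.g.
#
#     import requests
#     resp = requests.post(
#         "http://localhost:8080/api/v1/dags/<deploy_dag_id>/dagRuns",   # placeholder host and dag id
#         json={"conf": {"account_id": "test_account", "redis_enabled": False,
#                        "host": "192.168.1.100", "port": 9090}},
#         auth=("airflow", "airflow"),                                   # placeholder credentials
#         timeout=30,
#     )
#     resp.raise_for_status()
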
from airflow import DAG
|
|
||||||
from airflow.models.param import Param
|
|
||||||
from airflow.operators.empty import EmptyOperator
|
|
||||||
from airflow.operators.python import PythonOperator
|
|
||||||
# HttpSensor is no longer used
|
|
||||||
# from airflow.providers.http.sensors.http import HttpSensor
|
|
||||||
from airflow.utils.trigger_rule import TriggerRule
|
|
||||||
from airflow.hooks.base import BaseHook
|
|
||||||
from airflow.exceptions import AirflowException
|
|
||||||
from typing import Sequence # Add Sequence for type hinting
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
from airflow.utils.dates import days_ago # Add this import
|
|
||||||
import uuid
|
|
||||||
import os
|
|
||||||
import logging
|
|
||||||
import shutil
|
|
||||||
import docker
|
|
||||||
import uuid
|
|
||||||
import redis
|
|
||||||
import requests
|
|
||||||
import socket
|
|
||||||
import time
|
|
||||||
import sys # Import sys for maxsize
|
|
||||||
from airflow.configuration import conf # Import conf
|
|
||||||
|
|
||||||
# Import and apply Thrift exceptions patch
|
|
||||||
try:
|
|
||||||
# Always apply the patch, regardless of environment
|
|
||||||
from thrift_exceptions_patch import patch_thrift_exceptions
|
|
||||||
patch_thrift_exceptions()
|
|
||||||
logging.info("Applied Thrift exceptions patch for Airflow compatibility")
|
|
||||||
|
|
||||||
# Verify the patch was applied correctly
|
|
||||||
try:
|
|
||||||
from pangramia.yt.exceptions.ttypes import PBServiceException
|
|
||||||
test_exception = PBServiceException(message="Test")
|
|
||||||
# Try to modify attributes to verify patch works
|
|
||||||
test_exception.args = ("Test",)
|
|
||||||
test_exception.message = "Modified test"
|
|
||||||
logging.info("Verified Thrift exception patch is working correctly")
|
|
||||||
except Exception as verify_error:
|
|
||||||
logging.error(f"Thrift exception patch verification failed: {verify_error}")
|
|
||||||
logging.error("This may cause 'immutable instance' errors during error handling")
|
|
||||||
except ImportError as e:
|
|
||||||
logging.warning(f"Could not import thrift_exceptions_patch: {e}")
|
|
||||||
logging.warning("Airflow compatibility will be affected - expect 'immutable instance' errors")
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"Error applying Thrift exceptions patch: {e}")
|
|
||||||
|
|
||||||
# Default arguments for the DAG
|
|
||||||
default_args = {
|
|
||||||
'owner': 'airflow',
|
|
||||||
'depends_on_past': False,
|
|
||||||
'email_on_failure': False,
|
|
||||||
'email_on_retry': False,
|
|
||||||
'retries': 0, # Disable retries for all tasks in this DAG
|
|
||||||
'retry_delay': timedelta(minutes=5),
|
|
||||||
# Removed 'queue': 'auth_queue' to use the default queue
|
|
||||||
# Optional: Further filter workers by tags if using CeleryExecutor
|
|
||||||
'executor_config': {"CeleryExecutor": {"tags": ["auth_node"]}},
|
|
||||||
}
|
|
||||||
|
|
||||||
def get_redis_connection(redis_host=None, redis_port=None):
|
|
||||||
"""Get a Redis connection using Airflow's Redis connection or manually specified host/port."""
|
|
||||||
if redis_host and redis_port:
|
|
||||||
# Use manually specified host and port
|
|
||||||
return redis.Redis(
|
|
||||||
host=redis_host,
|
|
||||||
port=redis_port,
|
|
||||||
db=0,
|
|
||||||
decode_responses=True
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Use Airflow's Redis connection
|
|
||||||
redis_conn = BaseHook.get_connection("redis_default")
|
|
||||||
# Use the password from the connection if available, otherwise use 'airflow' as default
|
|
||||||
password = redis_conn.password or 'airflow'
|
|
||||||
return redis.Redis(
|
|
||||||
host=redis_conn.host, # 'redis' (service name in docker-compose)
|
|
||||||
port=redis_conn.port, # 6379
|
|
||||||
password=password,
|
|
||||||
db=0,
|
|
||||||
decode_responses=True
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_free_port():
|
|
||||||
"""Find and return a free port."""
|
|
||||||
import socket
|
|
||||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
||||||
s.bind(('0.0.0.0', 0))
|
|
||||||
return s.getsockname()[1]
|
|
||||||
|
|
||||||
def is_port_free(p):
|
|
||||||
"""Check if a port is free to use."""
|
|
||||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
||||||
try:
|
|
||||||
s.bind(('0.0.0.0', p))
|
|
||||||
return True
|
|
||||||
except OSError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def store_account_metadata(account_id, ip, port, proxy=None, health_port=None, container_id=None):
|
|
||||||
"""Store account metadata in Redis."""
|
|
||||||
redis_client = get_redis_connection()
|
|
||||||
try:
|
|
||||||
# Verify Redis connection
|
|
||||||
if not redis_client.ping():
|
|
||||||
raise ConnectionError("Failed to connect to Redis")
|
|
||||||
|
|
||||||
# Store main account metadata
|
|
||||||
mapping = {
|
|
||||||
"ip": ip,
|
|
||||||
"port": str(port),
|
|
||||||
"status": "running",
|
|
||||||
"start_time": str(time.time())
|
|
||||||
}
|
|
||||||
if proxy:
|
|
||||||
mapping["proxy"] = proxy
|
|
||||||
if health_port:
|
|
||||||
mapping["health_port"] = str(health_port)
|
|
||||||
if container_id:
|
|
||||||
mapping["container_id"] = container_id
|
|
||||||
|
|
||||||
# Use pipeline for atomic operations
|
|
||||||
with redis_client.pipeline() as pipe:
|
|
||||||
# Store main metadata
|
|
||||||
pipe.hset(f"ytdlp:{account_id}", mapping=mapping)
|
|
||||||
# Set expiration (1 week)
|
|
||||||
pipe.expire(f"ytdlp:{account_id}", 604800)
|
|
||||||
# Add to account list
|
|
||||||
pipe.sadd("ytdlp:accounts", account_id)
|
|
||||||
# Execute all commands
|
|
||||||
results = pipe.execute()
|
|
||||||
|
|
||||||
# Verify all commands succeeded
|
|
||||||
if not all(results):
|
|
||||||
raise RuntimeError(f"Failed to store metadata for {account_id}. Pipeline results: {results}")
|
|
||||||
|
|
||||||
# Verify the data was actually stored
|
|
||||||
stored_data = redis_client.hgetall(f"ytdlp:{account_id}")
|
|
||||||
if not stored_data:
|
|
||||||
raise RuntimeError(f"Failed to verify stored data for {account_id}")
|
|
||||||
|
|
||||||
logging.info(f"Successfully stored account metadata for {account_id} in Redis: {stored_data}")
|
|
||||||
return True
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"Failed to store account metadata for {account_id}: {e}", exc_info=True)
|
|
||||||
# Attempt cleanup if storage failed
|
|
||||||
try:
|
|
||||||
redis_client = get_redis_connection() # Ensure client is available
|
|
||||||
redis_client.delete(f"ytdlp:{account_id}")
|
|
||||||
redis_client.srem("ytdlp:accounts", account_id)
|
|
||||||
except Exception as cleanup_error:
|
|
||||||
logging.error(f"Failed to cleanup failed storage for {account_id}: {cleanup_error}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
# Removed get_account_metadata function as the service now handles Redis registration checks.
|
|
||||||
|
|
||||||
def prepare_and_deploy_service(**context):
|
|
||||||
"""Prepare deployment and deploy the Docker service."""
|
|
||||||
# Retrieve account_id, proxy, clients, and other parameters from DAG run configuration (conf)
|
|
||||||
# Set default values for account_id, proxy, and redis_enabled
|
|
||||||
account_id = context['dag_run'].conf.get('account_id') or context['params'].get('account_id', 'account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09')
|
|
||||||
proxy = context['dag_run'].conf.get('proxy') or context['params'].get('proxy', 'socks5://sslocal-rust-1084:1084')
|
|
||||||
clients = context['dag_run'].conf.get('clients') or context['params'].get('clients', 'ios,android,mweb')
|
|
||||||
redis_enabled = context['dag_run'].conf.get('redis_enabled', False) # Default to False
|
|
||||||
host_param = context['dag_run'].conf.get('host') # Host parameter from config
|
|
||||||
port_param = context['dag_run'].conf.get('port') # Port parameter from config
|
|
||||||
docker_network = context['dag_run'].conf.get('docker_network') or context['params'].get('docker_network', 'airflow_prod_proxynet')
|
|
||||||
host_external_ip_env = os.getenv('HOST_EXTERNAL_IP') # Explicit external IP from environment
|
|
||||||
|
|
||||||
if not account_id:
|
|
||||||
raise ValueError("Account ID is missing.")
|
|
||||||
|
|
||||||
# --- Port Determination ---
|
|
||||||
# Assign a free port if not provided, or validate the provided one
|
|
||||||
if not port_param:
|
|
||||||
port = get_free_port()
|
|
||||||
if not is_port_free(port):
|
|
||||||
raise ValueError(f"Assigned port {port} is already in use")
|
|
||||||
logging.info(f"No port provided, assigned free port: {port}")
|
|
||||||
else:
|
|
||||||
port = int(port_param)
|
|
||||||
if not is_port_free(port):
|
|
||||||
raise ValueError(f"Provided port {port} is already in use")
|
|
||||||
logging.info(f"Using provided port: {port}")
|
|
||||||
|
|
||||||
# Determine health port
|
|
||||||
health_port = port + 1
|
|
||||||
if not is_port_free(health_port):
|
|
||||||
raise ValueError(f"Health port {health_port} (derived from port {port}) is already in use")
|
|
||||||
logging.info(f"Using health port: {health_port}")
|
|
||||||
|
|
||||||
|
|
||||||
# --- Host Determination ---
|
|
||||||
# host_for_registration: IP/Host for client discovery (Redis/Logs)
|
|
||||||
# host_for_sensor: Hostname/IP for Airflow HttpSensor health check
|
|
||||||
|
|
||||||
host_for_registration = host_param # Start with the parameter value
|
|
||||||
|
|
||||||
if redis_enabled:
|
|
||||||
# If Redis is enabled, registration host should ideally be externally reachable
|
|
||||||
if not host_for_registration:
|
|
||||||
host_for_registration = host_external_ip_env # Use external IP from env var if available
|
|
||||||
if not host_for_registration:
|
|
||||||
# If no env var, try fetching external IP using requests
|
|
||||||
try:
|
|
||||||
logging.info("HOST_EXTERNAL_IP not set. Attempting to fetch external IP from api.ipify.org...")
|
|
||||||
response = requests.get('https://api.ipify.org', timeout=10) # 10 second timeout
|
|
||||||
response.raise_for_status() # Raise exception for bad status codes
|
|
||||||
host_for_registration = response.text.strip()
|
|
||||||
if not host_for_registration: # Check if response was empty
|
|
||||||
raise ValueError("Received empty response from api.ipify.org")
|
|
||||||
logging.info(f"Successfully fetched external IP: {host_for_registration}")
|
|
||||||
except requests.exceptions.RequestException as e:
|
|
||||||
logging.warning(f"Failed to fetch external IP: {e}. Falling back to Docker bridge IP.")
|
|
||||||
# Fallback to default Docker bridge IP if fetching fails
|
|
||||||
host_for_registration = "172.17.0.1"
|
|
||||||
logging.warning(f"Defaulting registration host to Docker bridge IP: {host_for_registration}. Ensure clients can reach this IP.")
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"Unexpected error fetching external IP: {e}. Falling back to Docker bridge IP.")
|
|
||||||
host_for_registration = "172.17.0.1"
|
|
||||||
logging.warning(f"Defaulting registration host to Docker bridge IP: {host_for_registration}. Ensure clients can reach this IP.")
|
|
||||||
else:
|
|
||||||
logging.info(f"Redis enabled. Using HOST_EXTERNAL_IP environment variable for registration: {host_for_registration}")
|
|
||||||
else:
|
|
||||||
logging.info(f"Redis enabled. Using provided host parameter for registration: {host_for_registration}")
|
|
||||||
else: # Redis disabled
|
|
||||||
# If Redis is disabled, registration host defaults to 0.0.0.0 if not provided
|
|
||||||
if not host_for_registration:
|
|
||||||
host_for_registration = "0.0.0.0"
|
|
||||||
logging.warning(f"Redis disabled and no host param provided. Defaulting registration host to {host_for_registration}.")
|
|
||||||
else:
|
|
||||||
logging.info(f"Redis disabled. Using provided host parameter for registration: {host_for_registration}")
|
|
||||||
|
|
||||||
# host_for_sensor determination will happen *after* container creation, using container name.
|
|
||||||
|
|
||||||
logging.info(f"Preparing deployment for account {account_id}. Registration Host: {host_for_registration}, Port: {port}, Health Port: {health_port}")
|
|
||||||
|
|
||||||
# Generate unique work ID and context directory
|
|
||||||
work_id = str(uuid.uuid4())
|
|
||||||
context['task_instance'].xcom_push(key='work_id', value=work_id)
|
|
||||||
|
|
||||||
context_dir = os.path.join(os.getenv('AIRFLOW_HOME', '/tmp'), 'service-data', work_id, 'context-data')
|
|
||||||
os.makedirs(context_dir, exist_ok=True, mode=0o777)
|
|
||||||
os.chmod(context_dir, 0o777)
|
|
||||||
|
|
||||||
# Push context directory and account details to XCom
|
|
||||||
context['task_instance'].xcom_push(key='context_dir', value=context_dir)
|
|
||||||
context['task_instance'].xcom_push(key='account_id', value=account_id)
|
|
||||||
|
|
||||||
# Deploy the Docker service
|
|
||||||
# The 'host_for_registration' variable here represents the externally accessible IP for registration/XCom.
|
|
||||||
# The service inside the container will listen on 0.0.0.0.
|
|
||||||
logging.info(f"Deploying service for account {account_id}. Registration Host: {host_for_registration}, Port: {port}")
|
|
||||||
|
|
||||||
# Get Redis connection details ONLY if redis_enabled (for the container to register itself)
|
|
||||||
redis_host_for_container = ''
|
|
||||||
redis_port_for_container = ''
|
|
||||||
redis_password_for_container = ''
|
|
||||||
if redis_enabled:
|
|
||||||
try:
|
|
||||||
# Get connection details to pass to the container environment
|
|
||||||
redis_conn_details = get_redis_connection().connection_pool.connection_kwargs
|
|
||||||
redis_host_for_container = os.getenv('REDIS_HOST', redis_conn_details.get('host', 'redis'))
|
|
||||||
redis_port_for_container = str(os.getenv('REDIS_PORT', redis_conn_details.get('port', 6379)))
|
|
||||||
redis_password_for_container = os.getenv('REDIS_PASSWORD', redis_conn_details.get('password', ''))
|
|
||||||
logging.info(f"Redis enabled. Passing REDIS_HOST={redis_host_for_container}, REDIS_PORT={redis_port_for_container} to container.")
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"Failed to get Redis connection details for container environment: {e}")
|
|
||||||
logging.warning("Proceeding without Redis details in container environment due to error.")
|
|
||||||
# Depending on container requirements, you might want to raise an error here instead
|
|
||||||
else:
|
|
||||||
logging.info("Redis disabled. Not passing REDIS_HOST/REDIS_PORT to container environment.")
|
|
||||||
|
|
||||||
|
|
||||||
# Get Docker connection details from Airflow
|
|
||||||
try:
|
|
||||||
secrets_backend = conf.get('secrets', 'backend', fallback='None')
|
|
||||||
logging.info(f"Attempting to get 'docker_hub' connection. Configured secrets backend: {secrets_backend}")
|
|
||||||
docker_conn = BaseHook.get_connection("docker_hub")
|
|
||||||
docker_username = docker_conn.login
|
|
||||||
docker_password = docker_conn.password
|
|
||||||
logging.info("Successfully retrieved 'docker_hub' connection.")
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"Failed to retrieve 'docker_hub' connection: {e}")
|
|
||||||
# Log details about potential secrets backend issues
|
|
||||||
secrets_backend_kwargs = conf.get('secrets', 'backend_kwargs', fallback='{}')
|
|
||||||
logging.error(f"Secrets backend details: backend={secrets_backend}, kwargs={secrets_backend_kwargs}")
|
|
||||||
# Re-raise the exception to fail the task
|
|
||||||
raise
|
|
||||||
|
|
||||||
    try:
        # Initialize Docker client to connect to docker-socket-proxy
        client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')

        # Authenticate with Docker Hub
        client.login(
            username=docker_username,
            password=docker_password,
            registry=docker_conn.host # Typically "https://index.docker.io/v1/"
        )

        # Generate a unique container name
        container_name = f"ytdlp_service_{account_id}_{uuid.uuid4().hex[:8]}"

        # Pull the Docker image (if not already present)
        client.images.pull('pangramia/ytdlp-ops-server:latest')

        # Use the configured network name (from params or default)
        network_name = docker_network # Use the retrieved parameter
        logging.info(f"Attempting to run container on network: {network_name}")

        # Determine if --probe flag should be added based on DAG param
        exit_on_proxy_fail = context['dag_run'].conf.get('exit_on_proxy_fail', True) # Default to True if not set
        command_args = [
            '--script-dir', '/app/scripts',
            '--context-dir', '/app/context-data', # Use the bind mount target inside container
            '--port', str(port),
            '--health-port', str(health_port),
            '--clients', clients,
            '--timeout', '120',
            '--proxy', proxy if proxy else '',
            '--server-identity', account_id, # Use account_id as server identity
        ]
        if redis_enabled:
            command_args.extend(['--redis-host', redis_host_for_container])
            command_args.extend(['--redis-port', redis_port_for_container])

        if exit_on_proxy_fail:
            command_args.append('--probe')
            logging.info("Adding --probe flag to container command as exit_on_proxy_fail=True")
        else:
            logging.info("Not adding --probe flag to container command as exit_on_proxy_fail=False")

        # Run the Docker container with health port
        container = client.containers.run(
            image='pangramia/ytdlp-ops-server:latest',
            command=command_args, # Use the constructed command list
            environment={
                'PYTHONUNBUFFERED': '1', # Ensure logs are not buffered
                'SERVER_PORT': str(port), # Port the service listens on *inside* the container
                'SERVER_HOST': '0.0.0.0', # Service should listen on all interfaces *inside* the container
                'ACCOUNT_ID': account_id,
                # Pass Redis details *if enabled* for the service to register itself
                'REDIS_HOST': redis_host_for_container,
                'REDIS_PORT': redis_port_for_container,
                'REDIS_PASSWORD': redis_password_for_container,
                # Pass PROXY_URL for health check access
                'PROXY_URL': proxy if proxy else '',
            },
            ports={
                f"{port}/tcp": port,
                f"{health_port}/tcp": health_port
            },
            volumes={
                context_dir: {'bind': '/app/context-data', 'mode': 'rw'}
            },
            network_mode=network_name, # Use the specified network variable
            auto_remove=False, # Do not auto-remove the container
            name=container_name, # Use a unique name
            detach=True,
            tty=True,
            shm_size='256m',
            # Updated healthcheck to test external connectivity via proxy
            healthcheck={
                # Use CMD-SHELL to allow conditional logic based on PROXY_URL env var
                'test': [
                    'CMD-SHELL',
                    # Script checks if PROXY_URL is set, uses it with curl if yes, otherwise curls directly.
                    # -f: Fail silently (exit non-zero on error)
                    # --connect-timeout 10: Timeout for connection phase
                    # > /dev/null: Discard output, we only care about exit code
                    'if [ -n "$PROXY_URL" ]; then '
                    'curl -f --connect-timeout 10 -x "$PROXY_URL" https://ifconfig.co > /dev/null; '
                    'else '
                    'curl -f --connect-timeout 10 https://ifconfig.co > /dev/null; '
                    'fi'
                ],
                'interval': 30 * 1000000000, # Check every 30 seconds (30 * 1e9 nanoseconds)
                'timeout': 15 * 1000000000, # Timeout after 15 seconds (15 * 1e9 nanoseconds)
                'retries': 5, # Retry 5 times on failure
                'start_period': 15 * 1000000000 # Grace period of 15 seconds after start
            },
            # Add labels for better identification
            labels={
                'service': 'ytdlp',
                'account_id': account_id
            }
        )

        # Wait for container to be running (skip health check verification)
        start_time = time.time()
        while True:
            container.reload()
            if container.status == 'running':
                break
            if time.time() - start_time > 10: # 10 second timeout
                raise TimeoutError("Container failed to start within 10 seconds")
            time.sleep(1)

        logging.info(f"Container started: {container.id} (health check verification skipped)")
        # Push container details immediately after creation using simplified keys
        context['task_instance'].xcom_push(key='container_id', value=container.id)
        context['task_instance'].xcom_push(key='container_name', value=container_name)
        logging.info(f"Pushed container_id={container.id} and container_name={container_name} to XCom.")

        # --- Determine Host for Sensor ---
        # Get the container's IP address on the specified network for the HttpSensor
        try:
            container.reload() # Refresh container attributes
            network_settings = container.attrs.get('NetworkSettings', {}).get('Networks', {})
            if network_name in network_settings:
                host_for_sensor = network_settings[network_name].get('IPAddress')
                if not host_for_sensor:
                    raise ValueError(f"Container {container.id} has no IPAddress on network '{network_name}'")
                logging.info(f"Using container IP '{host_for_sensor}' on network '{network_name}' for HttpSensor.")
            else:
                # Fallback or error if container not on expected network
                logging.error(f"Container {container.id} is not attached to the expected network '{network_name}'. Network settings: {network_settings}")
                # Option 1: Fallback to container name (might fail as observed)
                # host_for_sensor = container_name
                # logging.warning(f"Falling back to container name '{host_for_sensor}' for sensor.")
                # Option 2: Raise error
                raise ValueError(f"Container {container.id} not found on network '{network_name}'. Cannot determine IP for sensor.")

        except Exception as e:
            logging.error(f"Failed to get container IP address: {e}", exc_info=True)
            raise AirflowException(f"Failed to determine IP address for HttpSensor: {e}")

        # Ensure we don't use 0.0.0.0 or empty string for the sensor
        if not host_for_sensor or host_for_sensor == "0.0.0.0":
            raise ValueError(f"Determined host_for_sensor is invalid ('{host_for_sensor}'). Check container network attachment and IP assignment.")

        # --- Add extra logging before pushing ---
        logging.info(f"FINAL CHECK before XCom push:")
        logging.info(f" Account ID: {account_id}")
        logging.info(f" Host for Sensor (IP Address): {host_for_sensor}")
        logging.info(f" Host for Registration: {host_for_registration}")
        logging.info(f" Service Port: {port}")
        logging.info(f" Health Port: {health_port}")
        logging.info(f" Pushing to XCom key: service_host with value: {host_for_sensor}")
        # --- End extra logging ---

        # Push distinct service connection details using simplified keys
        context['task_instance'].xcom_push(key='service_host_registration', value=host_for_registration) # For client discovery (e.g., Redis)
        context['task_instance'].xcom_push(key='service_host', value=host_for_sensor) # IP Address for HttpSensor
        context['task_instance'].xcom_push(key='service_port', value=port) # Port is the same
        context['task_instance'].xcom_push(key='service_health_port', value=health_port) # Health port is the same
        logging.info(f"Pushed host_for_sensor (IP Address)={host_for_sensor} to XCom key 'service_host'")
        logging.info(f"Pushed host_for_registration={host_for_registration} to XCom key 'service_host_registration'")

        # Store account metadata in Redis only if redis_enabled is True
        # This uses the 'host_for_registration' for client discovery
        if redis_enabled:
            store_account_metadata(account_id, host_for_registration, port, proxy, health_port, container.id)

        # If we reach here, deployment is considered successful for now
        logging.info("Deployment preparation successful.")
        # Return values are implicitly pushed to XCom (but we pushed explicitly above)
        return context_dir, host_for_registration, port

    except Exception as e:
        logging.error(f"Error during service deployment: {e}", exc_info=True)
        # Attempt to cleanup the container if it was created before the error
        try:
            if 'container' in locals() and container and container.id:
                logging.warning(f"Attempting to stop and remove container {container.id} due to deployment error.")
                container.stop(timeout=5)
                container.remove(force=True)
                logging.info(f"Successfully stopped and removed container {container.id} after error.")
            elif 'container_name' in locals() and container_name:
                # Try finding by name if ID wasn't captured
                containers = client.containers.list(filters={'name': container_name})
                if containers:
                    logging.warning(f"Attempting to stop and remove container {containers[0].name} by name due to deployment error.")
                    containers[0].stop(timeout=5)
                    containers[0].remove(force=True)
                    logging.info(f"Successfully stopped and removed container {containers[0].name} after error.")
        except Exception as cleanup_err:
            logging.error(f"Failed during post-error container cleanup: {cleanup_err}")
        raise # Re-raise the original exception to fail the task

# Removed the old monitor_health PythonOperator

# stop_service and cleanup_service are now defined directly in the DAG below.

def check_service_health(ti=None, **context):
    """
    Periodically checks the service's /health endpoint using requests.
    Acts as a long-running sentinel task. Fails if the health check fails
    repeatedly or times out.
    """
    # Get parameters from XCom
    host_reg = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_host_registration')
    host_svc = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_host')
    health_port = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_health_port')

    # Determine the host to use (prioritize registration host)
    host = host_reg if host_reg and host_reg != '0.0.0.0' else host_svc
    if not host or not health_port:
        raise AirflowException("Could not retrieve host or health_port from XCom for health check.")

    health_url = f"http://{host}:{health_port}/health"
    logging.info(f"Starting health check for: {health_url}")

    # Get configuration for polling
    # Use task's execution_timeout if available, otherwise default to 1 year
    task_timeout = ti.task.execution_timeout or timedelta(days=365)
    poke_interval = 60 # Check every 60 seconds (adjust as needed)
    start_time = time.monotonic()
    timeout_seconds = task_timeout.total_seconds()
    consecutive_error_start_time = None # Track start time of consecutive connection errors
    error_retry_window = 10 # Seconds to retry connection errors before failing

    while True:
        current_time = time.monotonic()
        if current_time - start_time > timeout_seconds:
            raise AirflowException(f"Health check timed out after {timeout_seconds} seconds for {health_url}")

        try:
            # Use a reasonable timeout for the individual request
            response = requests.get(health_url, timeout=15) # 15 second request timeout
            response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)

            # Check response content if needed (optional)
            # Example: Check for specific JSON content
            # try:
            #     data = response.json()
            #     if data.get("status") == "healthy":
            #         logging.info(f"Health check successful: Status {response.status_code}")
            #     else:
            #         logging.warning(f"Health check OK (Status {response.status_code}), but content unexpected: {data}")
            # except requests.exceptions.JSONDecodeError:
            #     logging.warning(f"Health check OK (Status {response.status_code}), but response is not valid JSON.")

            # If we got a 2xx status, log success and reset error timer if needed
            if consecutive_error_start_time is not None:
                logging.info(f"Connection to {health_url} recovered.")
                consecutive_error_start_time = None
            logging.info(f"Health check successful: Status {response.status_code} for {health_url}")

        except requests.exceptions.Timeout:
            current_monotonic_time = time.monotonic()
            if consecutive_error_start_time is None:
                consecutive_error_start_time = current_monotonic_time
                logging.warning(f"Health check request timed out for {health_url}. Starting {error_retry_window}s retry window...")
            else:
                elapsed_error_time = current_monotonic_time - consecutive_error_start_time
                if elapsed_error_time > error_retry_window:
                    error_msg = f"Health check failed for {health_url}: Timeout persisted for over {error_retry_window} seconds."
                    logging.error(error_msg)
                    raise AirflowException(error_msg)
                else:
                    logging.warning(f"Health check request timed out for {health_url}. Retrying within {error_retry_window}s window ({elapsed_error_time:.1f}s elapsed)...")

        except requests.exceptions.ConnectionError as e:
            # Check if the error is specifically "Connection refused" - fail immediately
            if "[Errno 111] Connection refused" in str(e):
                logging.error(f"Health check failed for {health_url}: Connection refused. Failing task immediately.")
                raise AirflowException(f"Health check failed for {health_url}: Connection refused")
            else:
                # Handle other connection errors with the retry window
                current_monotonic_time = time.monotonic()
                if consecutive_error_start_time is None:
                    consecutive_error_start_time = current_monotonic_time
                    logging.warning(f"Health check connection error for {health_url}: {e}. Starting {error_retry_window}s retry window...")
                else:
                    elapsed_error_time = current_monotonic_time - consecutive_error_start_time
                    if elapsed_error_time > error_retry_window:
                        error_msg = f"Health check failed for {health_url}: Connection error persisted for over {error_retry_window} seconds. Last error: {e}"
                        logging.error(error_msg)
                        raise AirflowException(error_msg)
                    else:
                        logging.warning(f"Health check connection error for {health_url}: {e}. Retrying within {error_retry_window}s window ({elapsed_error_time:.1f}s elapsed)...")

        except requests.exceptions.HTTPError as e:
            # This catches 4xx/5xx errors - fail immediately
            logging.error(f"Health check failed for {health_url}: Status {e.response.status_code}. Failing task.")
            # Fail the task immediately on HTTP error
            raise AirflowException(f"Health check failed for {health_url}: Status {e.response.status_code}")
        except requests.exceptions.RequestException as e:
            logging.error(f"Health check failed for {health_url} with unexpected error: {e}. Failing task.")
            # Fail the task immediately on other request errors
            raise AirflowException(f"Health check failed for {health_url}: {e}")
        except Exception as e:
            # Catch any other unexpected errors during the check
            logging.error(f"Unexpected error during health check for {health_url}: {e}", exc_info=True)
            raise AirflowException(f"Unexpected error during health check: {e}")

        # Wait for the poke interval before the next check
        time.sleep(poke_interval)

def _wait_forever():
    """Sleeps indefinitely (or until task timeout) to simulate a running service."""
    logging.info("Sentinel task started. Sleeping in a loop...")
    # Sleep in a loop with a reasonable interval to avoid OverflowError
    # The task will keep running until it times out based on execution_timeout
    # or is manually stopped/failed.
    while True:
        try:
            # Sleep for a long interval (e.g., 1 day)
            # You can adjust this interval if needed.
            time.sleep(86400) # Sleep for 24 hours
        except KeyboardInterrupt:
            logging.info("Sentinel task interrupted. Exiting.")
            break
        except Exception as e:
            # Log other potential errors during sleep, though unlikely
            logging.error(f"Error during sentinel sleep loop: {e}")
            # Optionally break or continue based on error handling strategy
            break # Exit loop on unexpected error

def stop_service(**context):
    """Stop the running Docker container with verification."""
    # Retrieve account_id from params or kwargs
    account_id = context.get('params', {}).get('account_id') or context.get('account_id')
    if not account_id:
        raise ValueError("Account ID is missing.")

    # Initialize Docker client to connect to docker-socket-proxy
    client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')

    try:
        # For testing, try to get container ID from environment if XCom is not available
        container_id = None
        if 'ti' in context:
            # Use simplified XCom key
            container_id = context['ti'].xcom_pull(task_ids='prepare_and_deploy', key='container_id')

        if not container_id:
            # If not found in XCom, try to find container by account_id pattern (keep this fallback)
            containers = client.containers.list(filters={"name": f"ytdlp_service_{account_id}"})
            if containers:
                container = containers[0]
                container_id = container.id
                logging.info(f"Found container by name pattern: {container.name} (ID: {container_id})")
            else:
                logging.warning(f"No container found for account {account_id} - nothing to stop")
                return

        if container_id:
            # If found in XCom, stop by container ID
            container = client.containers.get(container_id)

            # Verify container is running before stopping
            if container.status != 'running':
                logging.warning(f"Container {container_id} is not running (status: {container.status})")
                return

            logging.info(f"Stopping container {container_id}...")
            container.stop(timeout=10) # 10 second timeout

            # Verify container is stopped
            container.reload()
            if container.status == 'exited':
                logging.info(f"Successfully stopped container {container_id}")
            else:
                logging.error(f"Container {container_id} failed to stop (status: {container.status})")
                raise RuntimeError(f"Container {container_id} failed to stop")

            # Clear Redis entries only if redis_enabled is True
            # Retrieve redis_enabled status from DAG run conf or params
            redis_enabled = context['dag_run'].conf.get('redis_enabled', False) or context['params'].get('redis_enabled', False)
            if redis_enabled:
                redis_client = get_redis_connection()
                try:
                    # Verify Redis connection
                    if not redis_client.ping():
                        raise ConnectionError("Failed to connect to Redis")

                    # Remove main metadata
                    redis_client.delete(f"ytdlp:{account_id}")
                    # Remove from accounts set
                    redis_client.srem("ytdlp:accounts", account_id)
                    logging.info(f"Successfully cleared Redis entries for account: {account_id}")
                except Exception as e:
                    logging.error(f"Failed to clear Redis entries for account {account_id}: {e}")
                    # Do not raise here, allow container stop to be considered successful
                    # raise # Optional: re-raise if Redis cleanup failure should fail the task

            return

        logging.warning(f"No container found for account {account_id} - nothing to stop")

    except docker.errors.NotFound as e:
        logging.warning(f"Container for account {account_id} not found: {e}")
    except Exception as e:
        logging.error(f"Failed to stop container: {e}")
        raise

def cleanup_service(**context):
    """Cleanup service resources including Redis entries and XCom data."""
    # Note: This function is now called within the manual_stop_cleanup TaskGroup
    try:
        # Retrieve account_id from params first, then from XCom
        account_id = context['params'].get('account_id')
        if not account_id:
            # Try to get it from XCom
            account_id = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='account_id')
            if not account_id:
                logging.warning("Account ID not found in params or XCom - skipping resource cleanup")
                return

        # Redis cleanup (if redis_enabled=True) is handled in the 'stop_service' task.
        logging.info(f"Redis cleanup for account {account_id} is handled by the 'stop_service' task if enabled.")

        # Cleanup XCom data (using simplified keys where applicable)
        # Note: XCom cleanup is generally not strictly necessary but can be good practice.
        # Airflow manages XCom expiry. This code doesn't actually *delete* XComs.
        # To truly delete, you'd use the Airflow API or DB directly.
        # We'll leave the pull calls here as they don't harm anything.
        ti = context['task_instance']
        ti.xcom_pull(key='container_id', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='container_name', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='service_host_registration', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='service_host', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='service_port', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='service_health_port', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='work_id', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='context_dir', task_ids='prepare_and_deploy', include_prior_dates=True)
        ti.xcom_pull(key='account_id', task_ids='prepare_and_deploy', include_prior_dates=True) # Keep account_id pull
        logging.info(f"Pulled XCom data for potential cleanup logging for account: {account_id}")

        # Initialize Docker client
        client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')
        container_found_and_removed = False

        # Attempt 1: Get container ID from XCom using simplified key
        container_id_xcom = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='container_id')
        if container_id_xcom:
            logging.info(f"Attempting to remove container using XCom ID: {container_id_xcom}")
            try:
                container = client.containers.get(container_id_xcom)
                logging.info(f"Found container {container.id} (Name: {container.name}). Removing...")
                container.remove(force=True)
                logging.info(f"Successfully removed container {container.id}")
                container_found_and_removed = True
            except docker.errors.NotFound:
                logging.warning(f"Container with XCom ID {container_id_xcom} not found. Trying other methods.")
            except Exception as e:
                logging.error(f"Error removing container {container_id_xcom}: {e}")

        # Attempt 2: Find container by labels if not found/removed via XCom ID
        if not container_found_and_removed:
            logging.info(f"Attempting to find and remove container by labels: service=ytdlp, account_id={account_id}")
            try:
                containers = client.containers.list(
                    filters={'label': [f'service=ytdlp', f'account_id={account_id}']},
                    all=True # Include stopped containers
                )
                if containers:
                    for container in containers:
                        logging.info(f"Found container {container.id} (Name: {container.name}) by labels. Removing...")
                        try:
                            container.remove(force=True)
                            logging.info(f"Successfully removed container {container.id}")
                            container_found_and_removed = True # Mark as found even if only one is removed
                        except Exception as e:
                            logging.error(f"Error removing container {container.id} found by labels: {e}")
                else:
                    logging.info("No containers found matching labels.")
            except Exception as e:
                logging.error(f"Error searching for containers by labels: {e}")

        # Attempt 3: Find container by name pattern if still not found/removed
        if not container_found_and_removed:
            container_name_pattern = f"ytdlp_service_{account_id}_*"
            logging.info(f"Attempting to find and remove container by name pattern: {container_name_pattern}")
            try:
                containers = client.containers.list(filters={'name': container_name_pattern}, all=True)
                if containers:
                    for container in containers:
                        logging.info(f"Found container {container.id} (Name: {container.name}) by name pattern. Removing...")
                        try:
                            container.remove(force=True)
                            logging.info(f"Successfully removed container {container.id}")
                            container_found_and_removed = True
                        except Exception as e:
                            logging.error(f"Error removing container {container.id} found by name: {e}")
                else:
                    logging.info("No containers found matching name pattern.")
            except Exception as e:
                logging.error(f"Error searching for containers by name: {e}")

        if not container_found_and_removed:
            logging.warning(f"Could not find or remove any container for account {account_id} using ID, labels, or name.")

        # Get context directory from XCom and remove it
        context_dir = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='context_dir')
        if context_dir and os.path.exists(context_dir):
            shutil.rmtree(context_dir)
            logging.info(f"Cleaned up working directory: {context_dir}")
    except Exception as e:
        logging.error(f"Error during cleanup: {e}")
        raise

# Define the DAG
with DAG(
    'ytdlp_service',
    default_args=default_args,
    description='Deploy YTDLP token service for ios, android, mweb',
    schedule_interval=None,
    start_date=days_ago(1), # Use dynamic start date for manually triggered DAG
    catchup=False,
    tags=['youtube', 'tokens', 'service', 'docker'],
    # executor_config moved to default_args
    is_paused_upon_creation=False,
    params={
        'account_id': Param(
            'account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09',
            type="string",
            description="Required: The account ID for which the service is being deployed."
        ),
        'proxy': Param(
            'socks5://sslocal-rust-1084:1084',
            type=["null", "string"],
            description="Optional: The SOCKS5 proxy URL to use for the service (e.g., socks5://host:port)."
        ),
        'clients': Param(
            'ios,android,mweb',
            type="string",
            description="Comma-separated list of client types (e.g., ios,android,mweb)."
        ),
        'redis_enabled': Param(
            False,
            type="boolean",
            description="Use Redis for service discovery? If False, host/port must be provided or will be auto-assigned."
        ),
        'host': Param(
            None,
            type=["null", "string"],
            description="Optional: Host IP for the service. If redis_enabled=False and host is not provided, defaults to '0.0.0.0'. If redis_enabled=True and host is not provided, uses HOST_EXTERNAL_IP or defaults to '0.0.0.0'."
        ),
        'port': Param(
            None,
            type=["null", "integer"],
            description="Optional: Port for the service. If None, a free port will be assigned automatically. If redis_enabled=False and a port is provided, it will be used (after checking availability)."
        ),
        # redis_host and redis_port parameters are removed.
        # If redis_enabled=True, the DAG will use the 'redis_default' Airflow connection.
        'docker_network': Param(
            'airflow_prod_proxynet',
            type="string",
            description="Optional: The Docker network to attach the container to. Defaults to 'airflow_prod_proxynet'."
        ),
        'exit_on_proxy_fail': Param(
            True,
            type="boolean",
            description="Exit the service container immediately if the initial proxy test fails?"
        ),
    }
) as dag:

    # Task to prepare and deploy the service
    prepare_and_deploy = PythonOperator(
        task_id='prepare_and_deploy',
        python_callable=prepare_and_deploy_service,
        provide_context=True,
        trigger_rule='all_success' # Keep default trigger rule for prepare_and_deploy
    )

    # Combined Health Check and Sentinel Task using PythonOperator
    # This task runs for a long time, checking health periodically using the 'requests' library.
    # If the health check fails repeatedly or times out, the task fails, triggering 'stop_service'.
    monitor_service_health = PythonOperator(
        task_id='monitor_service_health',
        python_callable=check_service_health,
        provide_context=True,
        # Set execution timeout for the task itself (acts as the overall timeout)
        execution_timeout=timedelta(days=365), # Long timeout (e.g., 1 year)
        # op_kwargs can pass static config, but host/port come from XCom inside the function
        # poke_interval and request timeout are handled within check_service_health
    )
    monitor_service_health.doc_md = """
    ### Monitor Service Health Task (PythonOperator)
    Uses a Python function to periodically check the service's `/health` endpoint using the `requests` library.
    Acts as both a health check and a sentinel for the running service.
    - **Pulls from XCom:** Reads `service_host_registration`, `service_host`, and `service_health_port` from the `prepare_and_deploy` task to construct the target URL.
    - **Polling:** Checks the `/health` endpoint every 60 seconds.
    - **Timeout:** Uses the task's `execution_timeout` (set to 1 year) as the overall maximum duration. Individual requests have a 15-second timeout.
    - **Failure:** If a health check request returns a 4xx/5xx status code or encounters other request errors, the task fails immediately. If the overall `execution_timeout` is reached without a failure, the task would eventually time out and fail.
    """

    # Task to stop the service (runs if monitor_service_health fails)
    stop = PythonOperator(
        task_id='stop_service',
        python_callable=stop_service,
        provide_context=True,
        trigger_rule=TriggerRule.ONE_FAILED # Run only if monitor_service_health fails
    )
    stop.doc_md = """
    ### Stop Service Task
    Stops the Docker container associated with the service.
    - **Trigger Rule:** `one_failed` - This task only runs if the upstream `monitor_service_health` task fails.
    - Pulls container ID/name from XCom or finds it using labels/name patterns.
    - Clears Redis entries if `redis_enabled=True`.
    """

    # Marker task to indicate that the deployment failed
    prepare_failed_marker = EmptyOperator(
        task_id='prepare_failed_marker',
        trigger_rule=TriggerRule.ONE_FAILED # Run only if 'prepare_and_deploy' fails
    )

    # Task to cleanup resources (runs after stop sequence OR if prepare fails)
    cleanup = PythonOperator(
        task_id='cleanup_service',
        python_callable=cleanup_service,
        provide_context=True,
        trigger_rule=TriggerRule.ALL_DONE # Run after upstream (stop or prepare_failed_marker) is done
    )
    cleanup.doc_md = """
    ### Cleanup Service Task
    Removes the Docker container and cleans up related resources.
    - **Trigger Rule:** `all_done` - Runs after the `stop_service` task finishes, whether it succeeded or failed.
    - Removes the container using ID from XCom, labels, or name patterns.
    - Cleans up XCom variables.
    - Removes the context directory.
    """

    # Define task dependencies
    # Success Path: prepare -> monitor (runs indefinitely)
    # Monitor Failure Path: monitor (fails) -> stop -> cleanup
    # Prepare Failure Path: prepare (fails) -> prepare_failed_marker -> cleanup

    prepare_and_deploy >> monitor_service_health
    prepare_and_deploy >> prepare_failed_marker # Trigger marker if prepare fails

    monitor_service_health >> stop # Trigger stop if monitor fails

    # Cleanup is triggered after stop finishes OR after prepare_failed_marker finishes
    stop >> cleanup
    prepare_failed_marker >> cleanup
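Editor's note: the params block above defines everything a run of this DAG needed at trigger time. Purely as an illustration (not part of this commit), a run with a custom conf could be started through Airflow's stable REST API (`POST /api/v1/dags/{dag_id}/dagRuns`); the host, credentials, and conf values below are placeholders.

```python
# Illustrative only: trigger the ytdlp_service DAG with a custom conf.
# Host, credentials, and conf values are placeholders, not taken from this repo.
import requests

resp = requests.post(
    "http://localhost:8080/api/v1/dags/ytdlp_service/dagRuns",
    auth=("airflow", "airflow"),  # assumes the basic-auth API backend
    json={
        "conf": {
            "account_id": "account_example_123",
            "proxy": "socks5://proxy-host:1084",
            "clients": "ios,android,mweb",
            "redis_enabled": False,
            "exit_on_proxy_fail": True,
        }
    },
    timeout=10,
)
resp.raise_for_status()
print(resp.json()["dag_run_id"])
```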
BIN
airflow/dags/.DS_Store
vendored
BIN
airflow/dags/.DS_Store
vendored
Binary file not shown.
@ -1,88 +0,0 @@
# Architecture and description of the YTDLP Airflow DAGs

This document describes the architecture and purpose of the DAGs used to download videos from YouTube. The system is built around a continuous, self-sustaining loop for parallel, fault-tolerant processing.

## Main processing loop

Processing is carried out by two DAGs working as a pair: an orchestrator and a worker.

### `ytdlp_ops_orchestrator` (the "ignition" system)

- **Purpose:** This DAG acts as the "ignition" that starts processing. It is triggered manually to launch a given number of parallel worker loops.
- **How it works:**
  - It does **not** process URLs itself.
  - Its only job is to launch the configured number of `ytdlp_ops_worker_per_url` DAG runs.
  - It passes all required configuration (account pool, Redis connection, etc.) to the workers.

### `ytdlp_ops_worker_per_url` (self-sustaining worker)

- **Purpose:** This DAG processes a single URL and is designed to run in a continuous loop (a minimal sketch of the loop follows below).
- **How it works:**
  1. **Start:** The initial run is triggered by `ytdlp_ops_orchestrator`.
  2. **Fetching a task:** The worker pops one URL from the `_inbox` queue in Redis. If the queue is empty, the worker run finishes and its processing "lane" stops.
  3. **Processing:** It calls the `ytdlp-ops-server` service to obtain `info.json` and a proxy, then downloads the video.
  4. **Continue or stop:**
     - **On success:** It triggers a new run of itself, creating a continuous loop that processes the next URL.
     - **On failure:** The loop is broken (if `stop_on_failure` is set to `True`), stopping that processing "lane". This prevents a single problematic URL or account from halting the whole system.
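
The sketch below is a minimal, illustrative version of that pull-and-retrigger loop, not the worker code removed in this commit; the Redis key `ytdlp:urls_inbox`, the connection details, the sketch's DAG id, and the placeholder download step are all assumptions.

```python
# Illustrative sketch of the self-sustaining worker loop; not the real DAG code.
import redis
from airflow.decorators import dag, task
from airflow.exceptions import AirflowSkipException
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.utils.dates import days_ago


@dag(schedule_interval=None, start_date=days_ago(1), catchup=False)
def worker_loop_sketch():
    @task
    def pull_url_from_inbox() -> str:
        r = redis.Redis(host="redis", port=6379, decode_responses=True)
        url = r.lpop("ytdlp:urls_inbox")  # one URL per run
        if url is None:
            raise AirflowSkipException("Inbox empty - this lane stops here.")
        return url

    @task
    def download(url: str) -> None:
        # Placeholder: fetch info.json/proxy from ytdlp-ops-server, then run yt-dlp.
        ...

    trigger_self_run = TriggerDagRunOperator(
        task_id="trigger_self_run",
        trigger_dag_id="worker_loop_sketch",  # re-trigger itself to continue the loop
    )
    download(pull_url_from_inbox()) >> trigger_self_run


worker_loop_sketch()
```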

## Management DAGs

### `ytdlp_mgmt_proxy_account`

- **Purpose:** The main tool for monitoring and managing the state of the resources used by `ytdlp-ops-server`.
- **Functionality:**
  - **Status overview:** Shows the current status of all proxies and accounts (e.g., `ACTIVE`, `BANNED`, `RESTING`).
  - **Proxy management:** Lets you manually ban, unban, or reset the status of a proxy.
  - **Account management:** Lets you manually ban or unban accounts.

### `ytdlp_mgmt_queues`

- **Purpose:** Provides a set of tools for managing the Redis queues used in the processing pipeline.
- **Functionality (via the `action` parameter):**
  - `add_videos`: Add one or more YouTube URLs to the queue.
  - `clear_queue`: Clear (delete) the specified Redis key.
  - `list_contents`: Inspect the contents of a Redis key (list or hash).
  - `check_status`: Check the overall state of the queues (type, size).
  - `requeue_failed`: Move all URLs from the `_fail` store back into the `_inbox` queue for reprocessing (a sketch follows below).
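
A minimal sketch of what `requeue_failed` amounts to is shown below, assuming the `_fail` store is a Redis hash keyed by URL and `_inbox` is a Redis list; the key names and connection settings are illustrative, not read from the DAG code.

```python
# Sketch of requeue_failed: move every URL recorded in the _fail hash back to _inbox.
# Key names and connection settings are assumptions for illustration.
import redis


def requeue_failed(inbox_key: str = "ytdlp:urls_inbox",
                   fail_key: str = "ytdlp:urls_fail") -> int:
    r = redis.Redis(host="redis", port=6379, decode_responses=True)
    moved = 0
    for url in r.hkeys(fail_key):   # URLs are the hash fields
        r.rpush(inbox_key, url)     # push to the back of the inbox queue
        r.hdel(fail_key, url)       # drop the failure record
        moved += 1
    return moved
```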

## Resource management strategy (proxies and accounts)

The system uses an intelligent strategy for managing the lifecycle and state of accounts and proxies, to maximize the success rate and minimize bans.

- **Account lifecycle ("cooldown"):**
  - To prevent "burnout", accounts automatically move into a `RESTING` state after a period of heavy use.
  - Once the rest period expires, they automatically return to `ACTIVE` and become available to workers again.

- **Smart ban strategy:**
  - **Ban the account first:** On a serious error (e.g., `BOT_DETECTED`) the system penalizes **only the account** that caused the failure. The proxy keeps working.
  - **Sliding-window proxy bans:** A proxy is banned automatically only if it shows **systematic failures with DIFFERENT accounts** within a short time window, which is a reliable indicator that the proxy itself is the problem (see the illustrative sketch after this list).

- **Monitoring:**
  - The `ytdlp_mgmt_proxy_account` DAG is the main monitoring tool. It shows the current status of all resources, including the time remaining until banned or resting accounts become active again.
  - The execution graph of the `ytdlp_ops_worker_per_url` DAG now explicitly shows steps such as `assign_account`, `get_token`, `ban_account`, and `retry_get_token`, which makes debugging more transparent.
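
One way such a sliding-window check could be implemented is sketched below; it assumes failure events are kept per proxy in a Redis sorted set, with the timestamp as the score and the account ID encoded in the member. The key layout and thresholds are purely illustrative and are not taken from `ytdlp-ops-server`.

```python
# Illustrative sliding-window ban check: ban a proxy only if several *different*
# accounts failed through it within the window. Keys and thresholds are assumptions.
import time

import redis


def record_failure_and_check_ban(proxy_id: str, account_id: str,
                                 window_s: int = 900,
                                 distinct_accounts: int = 3) -> bool:
    r = redis.Redis(host="redis", port=6379, decode_responses=True)
    key = f"proxy_failures:{proxy_id}"
    now = time.time()
    r.zadd(key, {f"{account_id}:{now}": now})   # one event per failure
    r.zremrangebyscore(key, 0, now - window_s)  # drop events outside the window
    accounts = {member.split(":")[0] for member in r.zrange(key, 0, -1)}
    return len(accounts) >= distinct_accounts   # True -> ban the proxy
```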
|
|
||||||
|
|
||||||
## Внешние сервисы
|
|
||||||
|
|
||||||
### `ytdlp-ops-server` (Thrift Service)
|
|
||||||
|
|
||||||
- **Назначение:** Внешний сервис, который предоставляет аутентификационные данные (токены, cookies, proxy) для скачивания видео.
|
|
||||||
- **Взаимодействие:** Worker DAG (`ytdlp_ops_worker_per_url`) обращается к этому сервису перед началом загрузки для получения необходимых данных для `yt-dlp`.
|
|
||||||
|
|
||||||
## Логика работы Worker DAG (`ytdlp_ops_worker_per_url`)
|
|
||||||
|
|
||||||
Этот DAG является "рабочей лошадкой" системы. Он спроектирован как самоподдерживающийся цикл для обработки одного URL за запуск.
|
|
||||||
|
|
||||||
### Задачи и их назначение:
|
|
||||||
|
|
||||||
- **`pull_url_from_redis`**: Извлекает один URL из очереди `_inbox` в Redis. Если очередь пуста, DAG завершается со статусом `skipped`, останавливая эту "линию" обработки.
|
|
||||||
- **`assign_account`**: Выбирает аккаунт для выполнения задачи. Он будет повторно использовать тот же аккаунт, который был успешно использован в предыдущем запуске в своей "линии" (привязка аккаунта). Если это первый запуск, он выбирает случайный аккаунт.
|
|
||||||
- **`get_token`**: Основная задача. Она обращается к `ytdlp-ops-server` для получения `info.json`.
|
|
||||||
- **`handle_bannable_error_branch`**: Если `get_token` завершается с ошибкой, требующей бана, эта задача-развилка решает, что делать дальше, в зависимости от политики `on_bannable_failure`.
|
|
||||||
- **`ban_account_and_prepare_for_retry`**: Если политика разрешает повтор, эта задача банит сбойный аккаунт и выбирает новый для повторной попытки.
|
|
||||||
- **`retry_get_token`**: Выполняет вторую попытку получить токен с новым аккаунтом.
|
|
||||||
- **`ban_second_account_and_proxy`**: Если и вторая попытка неудачна, эта задача банит второй аккаунт и использованный прокси.
|
|
||||||
- **`download_and_probe`**: Если `get_token` (или `retry_get_token`) завершилась успешно, эта задача использует `yt-dlp` для скачивания медиа и `ffmpeg` для проверки целостности скачанного файла.
|
|
||||||
- **`mark_url_as_success`**: Если `download_and_probe` завершилась успешно, эта задача записывает результат в хэш `_result` в Redis.
|
|
||||||
- **`handle_generic_failure`**: Если любая из основных задач завершается с неисправимой ошибкой, эта задача записывает подробную информацию об ошибке в хэш `_fail` в Redis.
|
|
||||||
- **`decide_what_to_do_next`**: Задача-развилка, которая запускается после успеха или неудачи. Она решает, продолжать ли цикл.
|
|
||||||
- **`trigger_self_run`**: Задача, которая фактически запускает следующий экземпляр DAG, создавая непрерывный цикл.
|
|
||||||
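
For illustration, `download_and_probe` boils down to something like the sketch below: run `yt-dlp` with the previously fetched `info.json` and proxy, then probe the result with `ffprobe`. The exact flags and file layout used by the real task are not shown in this commit, so treat the options here as assumptions.

```python
# Sketch of a download-then-probe step; flags and paths are illustrative assumptions.
import subprocess
from pathlib import Path


def download_and_probe(info_json: Path, proxy: str, out_dir: Path) -> Path:
    out_tmpl = str(out_dir / "%(id)s.%(ext)s")
    subprocess.run(
        ["yt-dlp", "--load-info-json", str(info_json),
         "--proxy", proxy, "-o", out_tmpl],
        check=True,
    )
    media = next(out_dir.glob("*.*"))  # assume a single output file
    # ffprobe exits non-zero if the file is corrupt or unreadable
    subprocess.run(["ffprobe", "-v", "error", str(media)], check=True)
    return media
```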
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,23 +0,0 @@
import socket
import logging

logger = logging.getLogger(__name__)

def get_ip_address():
    """
    Get the primary IP address of the host.
    This is used by Airflow workers to advertise their IP for log serving,
    ensuring the webserver can reach them in a multi-host environment.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        # This doesn't even have to be reachable
        s.connect(('10.255.255.255', 1))
        ip_address = s.getsockname()[0]
        logger.info(f"Determined host IP address as: {ip_address}")
    except Exception as e:
        logger.warning(f"Could not determine IP address, falling back to 127.0.0.1. Error: {e}")
        ip_address = '127.0.0.1'
    finally:
        s.close()
    return ip_address
@ -1,56 +0,0 @@
from airflow.plugins_manager import AirflowPlugin
from airflow.hooks.base import BaseHook
from airflow.configuration import conf
import uuid
import backoff

class YTDLPHook(BaseHook):
    def __init__(self, conn_id='ytdlp_default'):
        super().__init__()
        self.conn_id = conn_id
        self.connection = self.get_connection(conn_id)
        self.timeout = conf.getint('ytdlp', 'timeout', fallback=120)
        self.max_retries = conf.getint('ytdlp', 'max_retries', fallback=3)

    @backoff.on_exception(backoff.expo,
                          Exception,
                          max_tries=3,
                          max_time=300)
    def start_service(self, host, port, service_id, work_dir):
        """Start token service as a long-running process"""
        import subprocess
        import os
        from pathlib import Path

        # Get script path relative to Airflow home
        airflow_home = os.getenv('AIRFLOW_HOME', '')
        script_path = Path(airflow_home).parent / 'ytdlp_ops_server.py'

        # Ensure work directory exists
        os.makedirs(work_dir, exist_ok=True)

        # Start service process
        cmd = [
            'python', str(script_path),
            '--port', str(port),
            '--host', host,
            '--service-id', service_id,
            '--context-dir', work_dir,
            '--script-dir', str(Path(airflow_home) / 'dags' / 'scripts')
        ]

        self.log.info(f"Starting token service: {' '.join(cmd)}")

        # Start process detached
        docker_cmd = [
            'docker-compose', '-f', 'docker-compose.yaml',
            'up', '-d', '--build', 'ytdlp-service'
        ]
        subprocess.run(docker_cmd, check=True)

        self.log.info(f"Token service started on {host}:{port}")
        return True

class YTDLPPlugin(AirflowPlugin):
    name = 'ytdlp_plugin'
    hooks = [YTDLPHook]
@ -1,14 +0,0 @@
2025-04-06 00:41:03,141 - INFO - Attempting to connect to server at 127.0.0.1:9090...
2025-04-06 00:41:03,141 - INFO - Successfully connected to server
2025-04-06 00:41:03,142 - INFO - Server connection test successful
2025-04-06 00:41:03,142 - INFO - Requesting token for URL: https://www.youtube.com/watch?v=sOlTX9uxUtM%27
2025-04-06 00:41:17,930 - INFO - Successfully received token data from server
2025-04-06 00:41:17,938 - INFO - Valid JSON with video data: Операция "Багратион". От поражения к победе.
2025-04-06 00:41:17,944 - INFO - Successfully saved info.json to info_json_sOlTX9uxUtM_1743889277.json and latest.json to latest.json
2025-04-06 00:44:05,608 - INFO - Attempting to connect to server at 127.0.0.1:9090...
2025-04-06 00:44:05,609 - INFO - Successfully connected to server
2025-04-06 00:44:05,609 - INFO - Server connection test successful
2025-04-06 00:44:05,610 - INFO - Requesting token for URL: https://www.youtube.com/watch?v=sOlTX9uxUtM%27
2025-04-06 00:44:18,350 - INFO - Successfully received token data from server
2025-04-06 00:44:18,357 - INFO - Valid JSON with video data: Операция "Багратион". От поражения к победе.
2025-04-06 00:44:18,364 - INFO - Successfully saved info.json to info_json_sOlTX9uxUtM_1743889458.json and latest.json to latest.json