Remove unused dags

aperez 2025-08-26 19:25:31 +03:00
parent de609aaecd
commit 7b13257bd1
20 changed files with 51 additions and 5311 deletions

.aider.chat.history.md (Normal file, 36 lines added)

@@ -0,0 +1,36 @@
# aider chat started at 2025-08-19 17:52:04
> Newer aider version v0.86.1 is available.
> /Users/aperez/.local/share/uv/tools/aider-chat/bin/python3 -m pip install --upgrade --upgrade-strategy only-if-needed aider-chat
> Run pip install? (Y)es/(N)o [Yes]: y
> Re-run aider to use new version.
# aider chat started at 2025-08-19 17:55:26
> /Users/aperez/.local/bin/aider --model gemini/gemini-2.5-pro-preview-03-25
> Aider v0.86.1
> Main model: gemini/gemini-2.5-pro-preview-03-25 with diff-fenced edit format
> Weak model: gemini/gemini-2.0-flash
> Git repo: .git with 34 files
> Repo-map: using 4096 tokens, auto refresh
> https://aider.chat/HISTORY.html#release-notes
> Please answer with one of: yes, no, skip, all
> Would you like to see what's new in this version? (Y)es/(N)o [Yes]: n
#### ls
>
>
> ^C again to exit
>
>
> ^C again to exit
#### /read-only ../yt-dlp-deployment/ansible/airflow_worker
> Added 51 files from directory /opt/yt-dlp-dags/../yt-dlp-deployment/ansible/airflow_worker to read-only files.
>
>
> ^C again to exit
>
>
> ^C KeyboardInterrupt

.aider.input.history (Normal file, 15 lines added)

@@ -0,0 +1,15 @@
# 2025-08-19 17:52:06.094899
+Y
# 2025-08-19 17:55:33.616531
+D
# 2025-08-19 17:55:35.382770
+No
# 2025-08-19 17:55:39.050939
+ls
# 2025-08-19 17:56:18.910148
+/read-only ../yt-dlp-deployment/ansible/airflow_worker

Binary file not shown.


@@ -1,23 +0,0 @@
AIRFLOW_IMAGE_NAME=apache/airflow:2.10.4
_AIRFLOW_WWW_USER_USERNAME=airflow
_AIRFLOW_WWW_USER_PASSWORD=airflow-password-ytld
AIRFLOW_UID=50000
AIRFLOW_PROJ_DIR=.
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow-new-super-pass@89.253.221.173:52919/airflow
AIRFLOW__CELERY__RESULT_BACKEND=db+postgresql://airflow:airflow-new-super-pass@89.253.221.173:52919/airflow
AIRFLOW__CELERY__BROKER_URL=redis://:rOhTAIlTFFylXsjhqwxnYxDChFc@89.253.221.173:52909/0
AIRFLOW_QUEUE=holisticlegs-download
AIRFLOW_QUEUE_CHECK=holisticlegs-check
AIRFLOW_QUEUE_UPLOAD=holisticlegs-upload
AIRFLOW__WEBSERVER__SECRET_KEY=8DJ6XbtIICassrVxM9jWV3eTlt5N3XtyEdyW
HOSTNAME=85.192.30.55
AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT=768M
AIRFLOW_WORKER_DOWNLOAD_MEM_RESERV=522M
AIRFLOW_WORKER_DOWNLOAD_CONCURRENCY=2
AIRFLOW_SMALL_WORKERS_MEM_LIMIT=1024M
AIRFLOW_SMALL_WORKERS_MEM_RESERV=512M
~
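For context on the removed worker configuration above: Airflow reads any variable of the form AIRFLOW__SECTION__KEY as a config override, while the AIRFLOW_QUEUE* and memory-limit variables are presumably consumed by the compose file when starting the Celery workers. A minimal sketch (not repository code; only the queue variable names are taken from the file above) of pinning a worker to one of these queues:

import os
import subprocess

def start_pinned_worker(queue_var: str = "AIRFLOW_QUEUE") -> None:
    """Start a Celery worker that only consumes the queue named in `queue_var`."""
    queue = os.environ.get(queue_var)
    if not queue:
        raise RuntimeError(f"{queue_var} is not set")
    # "airflow celery worker -q <queue>" restricts the worker to a single queue,
    # which is how the download/check/upload workers above are kept separate.
    subprocess.run(["airflow", "celery", "worker", "-q", queue], check=True)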


@@ -1,60 +0,0 @@
# This file contains all environment variables for the Airflow-based deployment.
# Copy this file to .env in the same directory and fill in your production values.
# This file is used by `generate_envoy_config.py` and `docker-compose-ytdlp-ops.yaml`.
# --- Common Configuration ---
# A unique name for this server instance, used as a key in Redis.
# This is hardcoded in the docker-compose file but can be overridden here.
SERVER_IDENTITY=ytdlp-ops-airflow-service
# Redis connection details for proxy and account state management.
REDIS_HOST=redis
REDIS_PORT=6379
REDIS_PASSWORD=redis_pwd_K3fG8hJ1mN5pQ2sT
# --- Airflow Database Configuration ---
# The password for the PostgreSQL database used by Airflow.
# This should be a secure, randomly generated password.
POSTGRES_PASSWORD=pgdb_pwd_A7bC2xY9zE1wV5uP
# The password for the Airflow web UI admin user.
AIRFLOW_ADMIN_PASSWORD=admin_pwd_X9yZ3aB1cE5dF7gH
# --- Envoy & Worker Configuration ---
# The public-facing port for the Envoy load balancer that fronts the WORKERS.
ENVOY_PORT=9080
# The port for Envoy's admin/stats interface.
ENVOY_ADMIN_PORT=9901
# The public-facing port for the standalone MANAGEMENT service.
MANAGEMENT_SERVICE_PORT=9091
# The number of Python server workers to run.
# Set to 1 to simplify debugging. Multi-worker mode is experimental.
YTDLP_WORKERS=1
# The starting port for the Python workers. They will use sequential ports (e.g., 9090, 9091, ...).
YTDLP_BASE_PORT=9090
# --- Camoufox (Browser) Configuration ---
# Comma-separated list of SOCKS5 proxies to be used by Camoufox instances.
# Each proxy will get its own dedicated browser instance.
# Example: CAMOUFOX_PROXIES="socks5://user:pass@p.webshare.io:80,socks5://user:pass@p.webshare.io:81"
CAMOUFOX_PROXIES="socks5://your_proxy_user:your_proxy_pass@proxy.example.com:1080,socks5://your_proxy_user:your_proxy_pass@proxy.example.com:1081"
# Password for VNC access to the Camoufox browser instances.
VNC_PASSWORD=vnc_pwd_Z5xW8cV2bN4mP7lK
# The starting port for VNC access. Ports will be assigned sequentially (e.g., 5901, 5902, ...).
CAMOUFOX_BASE_VNC_PORT=5901
# The internal port used by Camoufox for its WebSocket server. Usually does not need to be changed.
CAMOUFOX_PORT=12345
# --- General Proxy Configuration ---
# A general-purpose SOCKS5 proxy that can be used alongside Camoufox proxies.
# This should be the IP address of the proxy server accessible from within the Docker network.
# '172.17.0.1' is often the host IP from within a container.
SOCKS5_SOCK_SERVER_IP=172.17.0.1
# --- Account Manager Configuration ---
# Account cooldown parameters (values are in minutes).
ACCOUNT_ACTIVE_DURATION_MIN=30
ACCOUNT_COOLDOWN_DURATION_MIN=60
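The removed example configuration above describes one Camoufox browser instance per proxy, with sequential VNC ports from CAMOUFOX_BASE_VNC_PORT, and sequential worker ports from YTDLP_BASE_PORT. A minimal sketch (an illustration only, using the variable names documented above) of how those assignments could be derived:

import os

def plan_ports() -> dict:
    """Derive per-instance port assignments from the environment documented above."""
    proxies = [p.strip() for p in os.environ.get("CAMOUFOX_PROXIES", "").split(",") if p.strip()]
    base_vnc = int(os.environ.get("CAMOUFOX_BASE_VNC_PORT", "5901"))
    base_port = int(os.environ.get("YTDLP_BASE_PORT", "9090"))
    workers = int(os.environ.get("YTDLP_WORKERS", "1"))
    return {
        # Each proxy gets a dedicated Camoufox instance and a sequential VNC port.
        "camoufox": [{"proxy": p, "vnc_port": base_vnc + i} for i, p in enumerate(proxies)],
        # Python server workers listen on sequential ports starting at YTDLP_BASE_PORT.
        "worker_ports": [base_port + i for i in range(workers)],
    }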

File diff suppressed because it is too large.


@@ -1,736 +0,0 @@
import sys
import os
import time
import csv
import json
import logging
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Optional, Dict, Callable, Union
from threading import Event
from PyQt6.QtCore import Qt, QThread, pyqtSignal, QObject, QTimer
from PyQt6.QtWidgets import (
QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
QLabel, QLineEdit, QPushButton, QTextEdit, QSpinBox, QDoubleSpinBox,
QCheckBox, QGroupBox, QGridLayout, QMessageBox, QProgressBar, QDialog,
QComboBox, QFileDialog
)
# Define the current version of this tool.
CURRENT_VERSION = "1.3.0"
class ProxyChecker:
"""
Fetches proxy lists from given URLs and checks if they work.
Supports cancellation, pause/resume, progress reporting, and collects optional detailed
response times, anonymity classification, and geo-location details for working proxies.
"""
def __init__(self,
proxy_urls: Dict[str, str],
timeout: int = 1,
max_retries: int = 3,
retry_delay: float = 1.0,
max_workers: int = 20,
check_url: str = "http://www.google.com",
detailed_results: bool = False,
export_format: str = "txt", # or "csv" or "json"
user_agent: Optional[str] = None,
log_callback: Optional[Callable[[str], None]] = None,
progress_callback: Optional[Callable[[int], None]] = None):
self.proxy_urls = proxy_urls
self.timeout = timeout
self.max_retries = max_retries
self.retry_delay = retry_delay
self.max_workers = max_workers
self.check_url = check_url
self.detailed_results = detailed_results
self.export_format = export_format.lower()
self.user_agent = user_agent
self.log_callback = log_callback
self.progress_callback = progress_callback
self.cancel_event = Event()
self.pause_event = Event() # When set, processing is paused
# Statistics counters
self.total_proxies_checked = 0
self.working_proxies_found = 0
self.overall_total_count = 0
self.overall_processed_count = 0
# Store detailed working results by type.
self.working_results: Dict[str, List[Union[str, Dict[str, Union[str, float, dict]]]]] = {}
self.session = requests.Session()
if self.user_agent:
self.session.headers["User-Agent"] = self.user_agent
# Determine the client IP to help with anonymity detection.
try:
r = requests.get("https://api.ipify.org?format=json", timeout=3)
r.raise_for_status()
self.client_ip = r.json().get("ip")
self.log("info", f"Client IP determined as {self.client_ip}")
except requests.RequestException:
self.client_ip = "unknown"
self.log("warning", "Could not determine client IP for anonymity detection.")
def log(self, level: str, message: str) -> None:
full_message = f"{level.upper()}: {message}"
if self.log_callback:
self.log_callback(full_message)
else:
print(full_message)
def cancel(self) -> None:
self.cancel_event.set()
self.log("info", "Cancellation requested.")
def pause(self) -> None:
self.pause_event.set()
self.log("info", "Proxy checking paused.")
def resume(self) -> None:
self.pause_event.clear()
self.log("info", "Proxy checking resumed.")
def determine_anonymity(self, proxy: str) -> str:
try:
session = requests.Session()
session.proxies = {'http': proxy, 'https': proxy}
r = session.get("https://api.ipify.org?format=json", timeout=self.timeout)
r.raise_for_status()
proxy_ip = r.json().get("ip")
return "transparent" if proxy_ip == self.client_ip else "anonymous"
except requests.RequestException:
return "unknown"
def get_geo_info(self, ip: str) -> dict:
try:
r = requests.get(f"http://ip-api.com/json/{ip}", timeout=3)
r.raise_for_status()
return r.json()
except requests.RequestException:
return {}
def check_proxy(self, proxy: str) -> Optional[Union[str, dict]]:
if self.cancel_event.is_set():
return None
# If paused, wait until resumed.
while self.pause_event.is_set():
time.sleep(0.1)
try:
start = time.time()
session = requests.Session()
session.proxies = {'http': proxy, 'https': proxy}
if self.user_agent:
session.headers["User-Agent"] = self.user_agent
response = session.get(self.check_url, timeout=self.timeout)
elapsed = time.time() - start
if response.status_code == 200:
if self.detailed_results:
anonymity = self.determine_anonymity(proxy)
ip_only = proxy.split(':')[0]
geo = self.get_geo_info(ip_only)
return {
"proxy": proxy,
"response_time": elapsed,
"anonymity": anonymity,
"geo": geo
}
else:
return proxy
except requests.RequestException:
return None
def get_proxies(self, url: str) -> List[str]:
for attempt in range(self.max_retries):
if self.cancel_event.is_set():
self.log("info", "Cancellation detected while fetching proxies.")
return []
try:
response = self.session.get(url, timeout=self.timeout)
response.raise_for_status()
self.log("info", f"Successfully fetched proxies from {url}")
return response.text.strip().splitlines()
except requests.RequestException as e:
self.log("warning", f"Attempt {attempt + 1} failed for {url}: {e}")
time.sleep(self.retry_delay)
self.log("error", f"Failed to retrieve proxies from {url} after {self.max_retries} attempts.")
return []
@staticmethod
def create_proxy_dir(directory: str) -> None:
os.makedirs(directory, exist_ok=True)
def process_proxies(self,
proxy_type: str,
url: Optional[str] = None,
proxies: Optional[List[str]] = None) -> int:
if proxies is None and url is not None:
proxies = self.get_proxies(url)
if self.cancel_event.is_set():
self.log("info", "Cancellation detected before processing proxies.")
return 0
if not proxies:
self.log("warning", f"No proxies to check for {proxy_type}")
return 0
total_proxies = len(proxies)
self.log("info", f"Checking {total_proxies} {proxy_type} proxies with {self.max_workers} workers.")
working_proxy_list = []
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
futures = {executor.submit(self.check_proxy, proxy): proxy for proxy in proxies}
for future in as_completed(futures):
while self.pause_event.is_set():
time.sleep(0.1)
if self.cancel_event.is_set():
self.log("info", "Cancellation detected during proxy checking loop.")
break
result = future.result()
self.overall_processed_count += 1
if self.progress_callback and self.overall_total_count > 0:
progress_percent = int((self.overall_processed_count / self.overall_total_count) * 100)
self.progress_callback(progress_percent)
if result:
working_proxy_list.append(result)
self.working_results[proxy_type] = working_proxy_list
file_ext = ".csv" if self.export_format == "csv" else ".json" if self.export_format == "json" else ".txt"
proxy_file = f'proxies/{proxy_type}{file_ext}'
self.create_proxy_dir(os.path.dirname(proxy_file))
try:
if self.export_format == "csv":
with open(proxy_file, 'w', newline='') as f:
if self.detailed_results:
writer = csv.writer(f)
writer.writerow(["Proxy", "Response Time (s)", "Anonymity", "Country", "Region", "City"])
for item in working_proxy_list:
geo = item.get("geo", {})
writer.writerow([
item.get("proxy"),
f"{item.get('response_time', 0):.2f}",
item.get("anonymity"),
geo.get("country", ""),
geo.get("regionName", ""),
geo.get("city", "")
])
else:
writer = csv.writer(f)
writer.writerow(["Proxy"])
for item in working_proxy_list:
writer.writerow([item])
elif self.export_format == "json":
with open(proxy_file, 'w') as f:
json.dump(working_proxy_list, f, indent=4)
else:
with open(proxy_file, 'w') as f:
if self.detailed_results:
lines = [
f"{item.get('proxy')} - {item.get('response_time'):.2f} s - {item.get('anonymity')} - {item.get('geo', {}).get('country', '')}"
for item in working_proxy_list
]
else:
lines = working_proxy_list
f.write('\n'.join(lines) + '\n')
except OSError as e:
self.log("error", f"Failed to write working proxies to {proxy_file}: {e}")
self.log("info", f"Checked {total_proxies} {proxy_type} proxies. Working: {len(working_proxy_list)}.")
self.total_proxies_checked += total_proxies
self.working_proxies_found += len(working_proxy_list)
return len(working_proxy_list)
def get_statistics(self) -> str:
stats = f"Total proxies checked: {self.total_proxies_checked}\n"
stats += f"Working proxies found: {self.working_proxies_found}\n"
if self.detailed_results:
all_times = []
for lst in self.working_results.values():
all_times.extend([item.get("response_time") for item in lst if isinstance(item, dict)])
if all_times:
avg_time = sum(all_times) / len(all_times)
stats += f"Average response time: {avg_time:.2f} seconds\n"
return stats
def run(self) -> None:
start_time = time.time()
self.overall_total_count = 0
self.overall_processed_count = 0
proxies_by_type: Dict[str, List[str]] = {}
for proxy_type, url in self.proxy_urls.items():
if self.cancel_event.is_set():
self.log("info", "Cancellation detected. Aborting processing.")
return
proxies = self.get_proxies(url)
proxies_by_type[proxy_type] = proxies
self.overall_total_count += len(proxies)
if self.overall_total_count == 0:
self.log("warning", "No proxies fetched from any source.")
for proxy_type, proxies in proxies_by_type.items():
if self.cancel_event.is_set():
self.log("info", "Cancellation detected. Aborting further processing.")
break
self.process_proxies(proxy_type, proxies=proxies)
self.session.close()
end_time = time.time()
minutes, seconds = divmod(end_time - start_time, 60)
self.log("info", f"Total proxies checked: {self.total_proxies_checked}. Working proxies: {self.working_proxies_found}.")
self.log("info", f"Execution time: {int(minutes)} minutes {int(seconds)} seconds.")
self.log("info", "Statistics:\n" + self.get_statistics())
# Append history log
try:
with open("history.log", "a") as hist_file:
hist_file.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {self.get_statistics()}\n")
except OSError as e:
self.log("error", f"Failed to write history log: {e}")
class ProxyCheckerWorker(QObject):
"""
Worker class to run the proxy checking process in a separate thread.
Emits log messages, progress updates, and a finished signal.
"""
log_signal = pyqtSignal(str)
progress_update = pyqtSignal(int)
finished = pyqtSignal()
def __init__(self,
proxy_urls: Dict[str, str],
timeout: int,
max_retries: int,
retry_delay: float,
max_workers: int,
check_url: str,
detailed_results: bool,
export_format: str,
user_agent: Optional[str] = None):
super().__init__()
self.proxy_urls = proxy_urls
self.timeout = timeout
self.max_retries = max_retries
self.retry_delay = retry_delay
self.max_workers = max_workers
self.check_url = check_url
self.detailed_results = detailed_results
self.export_format = export_format
self.user_agent = user_agent
self.checker: Optional[ProxyChecker] = None
def log_callback(self, message: str) -> None:
self.log_signal.emit(message)
def progress_callback(self, progress: int) -> None:
self.progress_update.emit(progress)
def cancel(self) -> None:
if self.checker is not None:
self.checker.cancel()
def run(self) -> None:
self.checker = ProxyChecker(
proxy_urls=self.proxy_urls,
timeout=self.timeout,
max_retries=self.max_retries,
retry_delay=self.retry_delay,
max_workers=self.max_workers,
check_url=self.check_url,
detailed_results=self.detailed_results,
export_format=self.export_format,
user_agent=self.user_agent,
log_callback=self.log_callback,
progress_callback=self.progress_callback
)
self.log_callback("Starting proxy checking...")
self.checker.run()
self.log_callback("Proxy checking finished.")
self.finished.emit()
class UpdateChecker(QObject):
"""
Worker class to check for software updates.
"""
update_checked = pyqtSignal(str)
def run(self) -> None:
try:
response = requests.get("https://api.github.com/repos/Jesewe/proxy-checker/releases/latest", timeout=5)
response.raise_for_status()
data = response.json()
latest_version = data["tag_name"].lstrip("v")
if latest_version != CURRENT_VERSION:
msg = (f"New version available: {latest_version}.\n"
f"You are using version {CURRENT_VERSION}.\n"
f"Visit {data['html_url']} to download the update.")
else:
msg = f"You are up-to-date with version {CURRENT_VERSION}."
except Exception as e:
msg = f"Failed to check for updates: {e}"
self.update_checked.emit(msg)
class MainWindow(QMainWindow):
def __init__(self):
super().__init__()
self.setWindowTitle("Proxy Checker")
self.setGeometry(100, 100, 850, 750)
self.init_ui()
self.thread: Optional[QThread] = None
self.worker: Optional[ProxyCheckerWorker] = None
self.update_thread: Optional[QThread] = None
self.last_checker: Optional[ProxyChecker] = None
self.is_paused = False
def init_ui(self):
main_widget = QWidget()
main_layout = QVBoxLayout()
# Configuration group
config_group = QGroupBox("Settings")
config_layout = QGridLayout()
# Timeout
config_layout.addWidget(QLabel("Timeout (s):"), 0, 0)
self.timeout_spin = QSpinBox()
self.timeout_spin.setRange(1, 60)
self.timeout_spin.setValue(3)
config_layout.addWidget(self.timeout_spin, 0, 1)
# Max Retries
config_layout.addWidget(QLabel("Max Retries:"), 0, 2)
self.retries_spin = QSpinBox()
self.retries_spin.setRange(1, 10)
self.retries_spin.setValue(3)
config_layout.addWidget(self.retries_spin, 0, 3)
# Retry Delay
config_layout.addWidget(QLabel("Retry Delay (s):"), 1, 0)
self.retry_delay_spin = QDoubleSpinBox()
self.retry_delay_spin.setRange(0.1, 10.0)
self.retry_delay_spin.setSingleStep(0.1)
self.retry_delay_spin.setValue(1.0)
config_layout.addWidget(self.retry_delay_spin, 1, 1)
# Max Workers
config_layout.addWidget(QLabel("Max Workers:"), 1, 2)
self.workers_spin = QSpinBox()
self.workers_spin.setRange(1, 200)
self.workers_spin.setValue(50)
config_layout.addWidget(self.workers_spin, 1, 3)
# Test URL
config_layout.addWidget(QLabel("Test URL:"), 2, 0)
self.test_url_edit = QLineEdit("http://www.google.com")
config_layout.addWidget(self.test_url_edit, 2, 1, 1, 3)
# Custom User-Agent
config_layout.addWidget(QLabel("Custom User-Agent:"), 3, 0)
self.user_agent_edit = QLineEdit("")
self.user_agent_edit.setPlaceholderText("Leave blank for default")
config_layout.addWidget(self.user_agent_edit, 3, 1, 1, 3)
# Detailed Results Option
self.detailed_checkbox = QCheckBox("Detailed Results (Include Response Time, Anonymity & Geo)")
config_layout.addWidget(self.detailed_checkbox, 4, 0, 1, 2)
# Export Format Option
config_layout.addWidget(QLabel("Export Format:"), 4, 2)
self.export_format_combo = QComboBox()
self.export_format_combo.addItems(["txt", "csv", "json"])
config_layout.addWidget(self.export_format_combo, 4, 3)
config_group.setLayout(config_layout)
main_layout.addWidget(config_group)
# Proxy Sources Group
proxy_group = QGroupBox("Proxy Sources")
proxy_layout = QGridLayout()
self.proxy_urls = {
"http": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt",
"socks4": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt",
"socks5": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt"
}
self.proxy_type_checkboxes = {}
self.proxy_url_edits = {}
row = 0
for proxy_type, url in self.proxy_urls.items():
checkbox = QCheckBox(proxy_type)
checkbox.setChecked(True)
self.proxy_type_checkboxes[proxy_type] = checkbox
proxy_layout.addWidget(checkbox, row, 0)
url_edit = QLineEdit(url)
self.proxy_url_edits[proxy_type] = url_edit
proxy_layout.addWidget(url_edit, row, 1)
row += 1
proxy_group.setLayout(proxy_layout)
main_layout.addWidget(proxy_group)
# Progress Bar
self.progress_bar = QProgressBar()
self.progress_bar.setRange(0, 100)
self.progress_bar.setValue(0)
main_layout.addWidget(self.progress_bar)
# Main Buttons
btn_layout = QHBoxLayout()
self.start_btn = QPushButton("Start Checking")
self.start_btn.clicked.connect(self.start_checking)
btn_layout.addWidget(self.start_btn)
self.pause_btn = QPushButton("Pause")
self.pause_btn.setEnabled(False)
self.pause_btn.clicked.connect(self.toggle_pause)
btn_layout.addWidget(self.pause_btn)
self.cancel_btn = QPushButton("Cancel")
self.cancel_btn.setEnabled(False)
self.cancel_btn.clicked.connect(self.cancel_checking)
btn_layout.addWidget(self.cancel_btn)
self.show_results_btn = QPushButton("Show Results")
self.show_results_btn.setEnabled(False)
self.show_results_btn.clicked.connect(self.show_results)
btn_layout.addWidget(self.show_results_btn)
main_layout.addLayout(btn_layout)
# Extra Buttons: Show Statistics, Save Log
extra_btn_layout = QHBoxLayout()
self.show_stats_btn = QPushButton("Show Statistics")
self.show_stats_btn.setEnabled(False)
self.show_stats_btn.clicked.connect(self.show_statistics)
extra_btn_layout.addWidget(self.show_stats_btn)
self.save_log_btn = QPushButton("Save Log")
self.save_log_btn.clicked.connect(self.save_log)
extra_btn_layout.addWidget(self.save_log_btn)
main_layout.addLayout(extra_btn_layout)
# Log Text Area
self.log_text = QTextEdit()
self.log_text.setReadOnly(True)
self.log_text.setStyleSheet("background-color: #1e1e1e; color: #d4d4d4; font-family: Consolas; font-size: 12pt;")
main_layout.addWidget(self.log_text)
main_widget.setLayout(main_layout)
self.setCentralWidget(main_widget)
def start_checking(self):
self.start_btn.setEnabled(False)
self.cancel_btn.setEnabled(True)
self.pause_btn.setEnabled(True)
self.show_results_btn.setEnabled(False)
self.show_stats_btn.setEnabled(False)
self.progress_bar.setValue(0)
self.log_text.clear()
# Build proxy_urls from selected checkboxes.
selected_proxy_urls = {}
for proxy_type, checkbox in self.proxy_type_checkboxes.items():
if checkbox.isChecked():
url = self.proxy_url_edits[proxy_type].text().strip()
if url:
selected_proxy_urls[proxy_type] = url
if not selected_proxy_urls:
QMessageBox.warning(self, "No Proxies Selected", "Please select at least one proxy type to check.")
self.start_btn.setEnabled(True)
self.cancel_btn.setEnabled(False)
self.pause_btn.setEnabled(False)
return
# Get settings from UI.
timeout = self.timeout_spin.value()
max_retries = self.retries_spin.value()
retry_delay = self.retry_delay_spin.value()
max_workers = self.workers_spin.value()
check_url = self.test_url_edit.text().strip()
detailed_results = self.detailed_checkbox.isChecked()
export_format = self.export_format_combo.currentText().strip()
user_agent = self.user_agent_edit.text().strip() or None
self.thread = QThread()
self.worker = ProxyCheckerWorker(
proxy_urls=selected_proxy_urls,
timeout=timeout,
max_retries=max_retries,
retry_delay=retry_delay,
max_workers=max_workers,
check_url=check_url,
detailed_results=detailed_results,
export_format=export_format,
user_agent=user_agent
)
self.worker.moveToThread(self.thread)
self.worker.log_signal.connect(self.append_log)
self.worker.progress_update.connect(self.progress_bar.setValue)
self.worker.finished.connect(self.on_finished)
self.thread.started.connect(self.worker.run)
self.thread.finished.connect(self.thread.deleteLater)
self.thread.start()
def toggle_pause(self):
if self.worker and self.worker.checker:
if not self.is_paused:
self.worker.checker.pause()
self.is_paused = True
self.pause_btn.setText("Resume")
self.append_log("Paused proxy checking.")
else:
self.worker.checker.resume()
self.is_paused = False
self.pause_btn.setText("Pause")
self.append_log("Resumed proxy checking.")
def cancel_checking(self):
if self.worker is not None:
self.append_log("Cancel requested by user...")
self.worker.cancel()
self.cancel_btn.setEnabled(False)
def append_log(self, message: str):
timestamp = time.strftime("%H:%M:%S")
self.log_text.append(f"[{timestamp}] {message}")
def on_finished(self):
self.append_log("All tasks completed.")
self.start_btn.setEnabled(True)
self.cancel_btn.setEnabled(False)
self.pause_btn.setEnabled(False)
self.show_results_btn.setEnabled(True)
self.show_stats_btn.setEnabled(True)
if self.thread is not None:
self.thread.quit()
self.thread.wait()
# Save a reference to the last checker for filtering results.
if self.worker:
self.last_checker = self.worker.checker
def show_results(self):
# If detailed results are enabled, allow filtering by response time.
if self.last_checker and self.last_checker.detailed_results:
dialog = QDialog(self)
dialog.setWindowTitle("Filtered Working Proxies")
dialog.resize(600, 500)
layout = QVBoxLayout()
filter_layout = QHBoxLayout()
filter_layout.addWidget(QLabel("Max Response Time (s):"))
filter_spin = QDoubleSpinBox()
filter_spin.setRange(0.1, 10.0)
filter_spin.setSingleStep(0.1)
filter_spin.setValue(1.0)
filter_layout.addWidget(filter_spin)
apply_btn = QPushButton("Apply Filter")
filter_layout.addWidget(apply_btn)
layout.addLayout(filter_layout)
result_area = QTextEdit()
result_area.setReadOnly(True)
layout.addWidget(result_area)
def apply_filter():
threshold = filter_spin.value()
text = ""
for ptype, results in self.last_checker.working_results.items():
filtered = []
for item in results:
if isinstance(item, dict) and item.get("response_time") <= threshold:
geo = item.get("geo", {})
filtered.append(f"{item.get('proxy')} - {item.get('response_time'):.2f} s - {item.get('anonymity')} - {geo.get('country', '')}")
if filtered:
text += f"--- {ptype} ---\n" + "\n".join(filtered) + "\n\n"
result_area.setText(text if text else "No proxies match the filter criteria.")
apply_btn.clicked.connect(apply_filter)
# Show all results initially
apply_filter()
btn_layout = QHBoxLayout()
copy_btn = QPushButton("Copy to Clipboard")
copy_btn.clicked.connect(lambda: QApplication.clipboard().setText(result_area.toPlainText()))
btn_layout.addWidget(copy_btn)
close_btn = QPushButton("Close")
close_btn.clicked.connect(dialog.close)
btn_layout.addWidget(close_btn)
layout.addLayout(btn_layout)
dialog.setLayout(layout)
dialog.exec()
else:
# Fallback: read the exported files from the proxies directory.
results_text = ""
proxy_dir = "proxies"
if os.path.isdir(proxy_dir):
for filename in os.listdir(proxy_dir):
filepath = os.path.join(proxy_dir, filename)
results_text += f"--- {filename} ---\n"
try:
with open(filepath, 'r') as f:
results_text += f.read() + "\n"
except OSError as e:
results_text += f"Error reading file: {e}\n"
else:
results_text = "No results found."
dialog = QDialog(self)
dialog.setWindowTitle("Working Proxies")
dialog.resize(600, 400)
dlg_layout = QVBoxLayout()
text_area = QTextEdit()
text_area.setReadOnly(True)
text_area.setText(results_text)
dlg_layout.addWidget(text_area)
btn_layout = QHBoxLayout()
copy_btn = QPushButton("Copy to Clipboard")
copy_btn.clicked.connect(lambda: QApplication.clipboard().setText(results_text))
btn_layout.addWidget(copy_btn)
close_btn = QPushButton("Close")
close_btn.clicked.connect(dialog.close)
btn_layout.addWidget(close_btn)
dlg_layout.addLayout(btn_layout)
dialog.setLayout(dlg_layout)
dialog.exec()
def show_statistics(self):
if self.worker and self.worker.checker:
stats = self.worker.checker.get_statistics()
else:
stats = "No statistics available."
QMessageBox.information(self, "Statistics", stats)
def save_log(self):
filename, _ = QFileDialog.getSaveFileName(self, "Save Log", "", "Text Files (*.txt);;All Files (*)")
if filename:
try:
with open(filename, 'w') as f:
f.write(self.log_text.toPlainText())
QMessageBox.information(self, "Saved", f"Log saved to {filename}")
except OSError as e:
QMessageBox.warning(self, "Error", f"Failed to save log: {e}")
def auto_check_for_update(self):
self.update_thread = QThread()
self.update_worker = UpdateChecker()
self.update_worker.moveToThread(self.update_thread)
self.update_worker.update_checked.connect(self.show_update_message)
self.update_thread.started.connect(self.update_worker.run)
self.update_thread.start()
def show_update_message(self, msg: str):
QMessageBox.information(self, "Update Check", msg)
self.update_thread.quit()
self.update_thread.wait()
def showEvent(self, event):
super().showEvent(event)
QTimer.singleShot(1000, self.auto_check_for_update)
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
app = QApplication(sys.argv)
window = MainWindow()
window.show()
sys.exit(app.exec())
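The removed tool above is a self-contained PyQt6 GUI, but the ProxyChecker class it defines can also be driven headlessly. A minimal usage sketch, assuming the class above is importable or already in scope:

# assumes ProxyChecker from the module above is in scope
checker = ProxyChecker(
    proxy_urls={"http": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt"},
    timeout=3,
    max_workers=50,
    check_url="http://www.google.com",
    detailed_results=False,
    export_format="txt",
    log_callback=print,                              # route log lines to stdout
    progress_callback=lambda pct: print(f"{pct}%"),  # simple progress readout
)
checker.run()                     # writes working proxies to proxies/http.txt
print(checker.get_statistics())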


@@ -1,941 +0,0 @@
from airflow import DAG
from airflow.models import BaseOperator, Variable
from airflow.utils.decorators import apply_defaults
from airflow.hooks.base import BaseHook
from airflow.exceptions import AirflowException
from airflow.utils.dates import days_ago
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from thrift.transport.TTransport import TTransportException
from datetime import datetime, timedelta
from pangramia.yt.exceptions.ttypes import PBServiceException
import redis
import logging
import time
import socket
import json
import os
from pangramia.yt.tokens_ops import YTTokenOpService
from pangramia.yt.common.ttypes import TokenUpdateMode
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.operators.python import PythonOperator
from airflow.models.param import Param
# Assuming ytdlp_utils exists in the same directory or PYTHONPATH
# from ytdlp_utils import get_info_json, is_valid_json, extract_video_id
# Configure logging
logger = logging.getLogger(__name__)
# Default settings (similar to ytdlp_client_dag.py)
MAX_RETRIES = 1
RETRY_DELAY = timedelta(seconds=10)
DEFAULT_TIMEOUT = 30
class YtdlpOpsOperator(BaseOperator):
"""
Custom Airflow operator to interact with YTDLP Thrift service. Handles direct connections
and Redis-based discovery, retrieves tokens, saves info.json, and manages errors.
"""
template_fields = ('url', 'service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir')
@apply_defaults
def __init__(self, url, redis_conn_id='redis_default', max_retries=3, retry_delay=10,
service_ip=None, service_port=None, redis_enabled=False, account_id=None,
save_info_json=True, info_json_dir=None, get_socks_proxy=True,
store_socks_proxy=False, timeout=DEFAULT_TIMEOUT, *args, **kwargs):
super().__init__(*args, **kwargs)
logger.info(f"Initializing YtdlpOpsOperator with parameters: url={url}, "
f"redis_conn_id={redis_conn_id}, max_retries={max_retries}, retry_delay={retry_delay}, "
f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, "
f"account_id={account_id}, save_info_json={save_info_json}, info_json_dir={info_json_dir}, "
f"get_socks_proxy={get_socks_proxy}, store_socks_proxy={store_socks_proxy}, timeout={timeout}")
# Validate required parameters
if not url:
raise ValueError("url is required")
# Validate parameters based on connection mode
if redis_enabled:
if not account_id:
raise ValueError("account_id is required when redis_enabled=True")
# Use default Redis connection if not specified
if not redis_conn_id:
redis_conn_id = 'redis_default'
logger.info(f"Using default Redis connection ID: {redis_conn_id}")
else:
if not service_ip or not service_port:
raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False")
if not account_id:
logger.warning("No account_id provided for direct connection mode. Using 'default'")
account_id = 'default' # Assign default if missing in direct mode
self.url = url
self.redis_conn_id = redis_conn_id
self.max_retries = max_retries
self.retry_delay = int(retry_delay.total_seconds() if isinstance(retry_delay, timedelta) else retry_delay)
self.service_ip = service_ip
self.service_port = service_port
self.redis_enabled = redis_enabled
self.account_id = account_id
self.save_info_json = save_info_json
self.info_json_dir = info_json_dir
self.get_socks_proxy = get_socks_proxy
self.store_socks_proxy = store_socks_proxy
self.timeout = timeout
def execute(self, context):
logger.info("Executing YtdlpOpsOperator")
transport = None
try:
logger.info("Getting task parameters")
params = context.get('params', {})
redis_enabled = params.get('redis_enabled', self.redis_enabled)
logger.info(f"Using redis_enabled={redis_enabled} (from {'task params' if 'redis_enabled' in params else 'operator init'})")
# Determine account_id to use (from params or operator default)
account_id = context['params'].get('account_id', self.account_id)
logger.info(f"Using account_id='{account_id}' (from {'task params' if 'account_id' in params else 'operator init'})")
if redis_enabled:
# Get Redis connection with proper authentication and error handling
redis_conn = BaseHook.get_connection(self.redis_conn_id)
redis_client = redis.Redis(
host=redis_conn.host,
port=redis_conn.port,
password=redis_conn.password,
db=0,
decode_responses=True # Important for consistent key handling
)
# Test Redis connection
try:
if not redis_client.ping():
raise redis.exceptions.ConnectionError("Redis ping failed")
logger.info(f"Successfully connected to Redis at {redis_conn.host}:{redis_conn.port}")
except redis.exceptions.AuthenticationError:
logger.error(f"Redis authentication failed for connection '{self.redis_conn_id}'. Check password.")
raise AirflowException("Redis authentication failed.")
except redis.exceptions.ConnectionError as e:
logger.error(f"Could not connect to Redis at {redis_conn.host}:{redis_conn.port}. Error: {e}")
raise AirflowException(f"Redis connection failed: {e}")
except Exception as e:
logger.error(f"Unexpected Redis error: {str(e)}")
raise AirflowException(f"Unexpected Redis error: {e}")
# Get service details from Redis with retries and proper key handling
service_key = f"ytdlp:{account_id}"
legacy_key = account_id # For backward compatibility
host = None
port = None
for attempt in range(self.max_retries):
try:
logger.info(f"Attempt {attempt + 1}/{self.max_retries}: Fetching service details from Redis for keys: '{service_key}', '{legacy_key}'")
service_details = redis_client.hgetall(service_key)
if not service_details:
logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'")
service_details = redis_client.hgetall(legacy_key)
if not service_details:
raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}")
# Find IP and port, handling potential case differences and byte/string types
ip_key = next((k for k in service_details if k.lower() == 'ip'), None)
port_key = next((k for k in service_details if k.lower() == 'port'), None)
if not ip_key: raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}")
if not port_key: raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}")
host = service_details[ip_key] # Already decoded due to decode_responses=True
port_str = service_details[port_key]
try:
port = int(port_str)
except ValueError:
raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}")
logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}")
break # Success
except Exception as e:
logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {str(e)}")
if attempt == self.max_retries - 1:
logger.error("Max retries reached for fetching Redis details.")
raise AirflowException(f"Failed to get service details from Redis after {self.max_retries} attempts: {e}")
logger.info(f"Retrying in {self.retry_delay} seconds...")
time.sleep(self.retry_delay)
else:
# Direct connection: Log parameter sources
params = context.get('params', {})
host = params.get('service_ip', self.service_ip)
host_source = 'task params' if 'service_ip' in params else 'operator init'
port_str = params.get('service_port', self.service_port)
port_source = 'task params' if 'service_port' in params else 'operator init'
url = params.get('url', self.url)
url_source = 'task params' if 'url' in params else 'operator init'
logger.info(f"Using service_ip={host} (from {host_source})")
logger.info(f"Using service_port={port_str} (from {port_source})")
logger.info(f"Using url={url} (from {url_source})")
if not host or not port_str:
raise ValueError("Direct connection requires service_ip and service_port")
try:
port = int(port_str)
except ValueError:
raise ValueError(f"Invalid service_port value: {port_str}")
logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)")
# Render and validate timeout
timeout_param = context.get('params', {}).get('timeout', self.timeout)
if isinstance(self.timeout, str) and '{{' in self.timeout:
timeout_rendered = self.render_template(self.timeout, context)
logger.info(f"Rendered timeout template: '{self.timeout}' -> '{timeout_rendered}'")
timeout_param = timeout_rendered
try:
timeout = int(timeout_param)
if timeout <= 0: raise ValueError("Timeout must be positive")
logger.info(f"Using timeout: {timeout} seconds")
except (ValueError, TypeError):
logger.warning(f"Invalid timeout value: '{timeout_param}'. Using default: {DEFAULT_TIMEOUT}")
timeout = DEFAULT_TIMEOUT
# Create Thrift connection objects
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET) # Explicitly use AF_INET (IPv4)
socket_conn.setTimeout(timeout * 1000) # Thrift timeout is in milliseconds
transport = TTransport.TFramedTransport(socket_conn)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = YTTokenOpService.Client(protocol)
logger.info(f"Attempting to connect to Thrift server at {host}:{port}...")
try:
transport.open()
logger.info("Successfully connected to Thrift server.")
# Test connection with ping
try:
client.ping()
logger.info("Server ping successful.")
except Exception as e:
logger.error(f"Server ping failed: {e}")
raise AirflowException(f"Server connection test (ping) failed: {e}")
# Get token from service with specific error handling
try:
url_param = context.get('params', {}).get('url', self.url)
logger.info(f"Requesting token for accountId='{account_id}', url='{url_param}'")
token_data = client.getOrRefreshToken(
accountId=account_id,
updateType=TokenUpdateMode.AUTO,
url=url_param
)
logger.info("Successfully retrieved token data from service.")
except PBServiceException as e:
logger.error(f"PBServiceException occurred: Code={getattr(e, 'errorCode', 'N/A')}, Message={getattr(e, 'message', 'N/A')}")
error_code = getattr(e, 'errorCode', None)
error_msg = f"YTDLP service error: {getattr(e, 'message', str(e))}"
# Handle specific known error codes
if error_code in [
"SOCKS5_CONNECTION_FAILED", "SOCKS5_TIMEOUT",
"SOCKS5_CONNECTION_REFUSED", "SOCKS5_CONNECTION_TIMEOUT",
"SOCKS5_HOST_NOT_FOUND", "SOCKS5_NETWORK_UNREACHABLE"
]:
error_msg = f"SOCKS5 proxy error ({error_code}): {e.message}. Check proxy settings."
elif error_code == "BOT_DETECTION":
error_msg = f"Bot detection triggered ({error_code}): {e.message}."
suggestions = getattr(e, 'context', {}).get('suggestions', [])
if suggestions: error_msg += "\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions)
elif error_code == "NODEJS_SCRIPT_ERROR":
error_msg = f"Node.js script error ({error_code}): {e.message}."
elif error_code == "NODEJS_TIMEOUT":
error_msg = f"Node.js timeout ({error_code}): {e.message}."
# Add more specific error handling as needed
raise AirflowException(error_msg)
except TTransportException as e:
logger.error(f"Thrift transport error during getOrRefreshToken: {e}")
raise AirflowException(f"Transport error during API call: {e}")
except Exception as e:
logger.error(f"Unexpected error during getOrRefreshToken: {e}")
raise AirflowException(f"Unexpected error during API call: {e}")
except TTransportException as e:
# Handle connection-specific transport errors
if "read 0 bytes" in str(e) or "Could not connect to" in str(e) or "Connection refused" in str(e):
logger.error(f"Connection failed to {host}:{port}. Details: {e}")
logger.error("Possible causes: Server down, firewall block, incorrect IP/port.")
raise AirflowException(f"Failed to connect to YTDLP service at {host}:{port}: {e}")
else:
logger.error(f"Thrift transport error during connection: {str(e)}")
raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}")
except Exception as e:
logger.error(f"Unexpected error during connection or ping: {str(e)}")
raise # Re-raise other unexpected errors
# Log received token data attributes for debugging
logger.debug(f"Token data received. Attributes: {dir(token_data)}")
for attr in dir(token_data):
if not attr.startswith('__') and not callable(getattr(token_data, attr)): # Log non-callable attributes
value = getattr(token_data, attr)
if attr == 'infoJson' and value:
logger.debug(f"infoJson: {value[:50]}...")
else:
logger.debug(f"{attr}: {value}")
info_json_path = None # Initialize info_json_path
save_info_json_param = context['params'].get('save_info_json', self.save_info_json)
# Render if it's a string template
if isinstance(save_info_json_param, str):
save_info_json_rendered = self.render_template(save_info_json_param, context)
# Convert common string representations to boolean
save_info_json = str(save_info_json_rendered).lower() in ['true', '1', 't', 'y', 'yes']
else:
save_info_json = bool(save_info_json_param)
# Save info.json if requested and valid
if save_info_json:  # use the value resolved from params above, not the raw operator default
info_json = self._get_info_json(token_data)
if info_json and self._is_valid_json(info_json):
try:
# Use internal _save_info_json method which handles rendering, dir creation, logging
info_json_path = self._save_info_json(context, info_json)
if info_json_path: # Check if saving was successful
context['task_instance'].xcom_push(key='info_json_path', value=info_json_path)
logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}")
else:
# _save_info_json should log errors, push None to indicate failure
context['task_instance'].xcom_push(key='info_json_path', value=None)
logger.warning("info.json saving failed (check logs from _save_info_json), pushing None to XCom for info_json_path.")
except Exception as e:
logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True)
context['task_instance'].xcom_push(key='info_json_path', value=None) # Push None on error
elif info_json:
logger.warning("Retrieved infoJson is not valid JSON. Skipping save.")
context['task_instance'].xcom_push(key='info_json_path', value=None)
else:
logger.info("No infoJson found in token data. Skipping save.")
context['task_instance'].xcom_push(key='info_json_path', value=None)
else:
logger.info("save_info_json is False. Skipping info.json save.")
context['task_instance'].xcom_push(key='info_json_path', value=None)
# Extract and potentially store SOCKS proxy
socks_proxy = None
if self.get_socks_proxy: # Use instance attribute
# Check for common attribute names for proxy
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
if proxy_attr:
socks_proxy = getattr(token_data, proxy_attr)
if socks_proxy: # Ensure proxy value is not empty
logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=socks_proxy)
logger.info(f"Pushed key 'socks_proxy' to XCom with value: {socks_proxy}")
else:
logger.info("SOCKS proxy extracted but not pushed to XCom (store_socks_proxy=False).")
else:
logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty. No proxy extracted.")
# Push None even if found but empty, if storing is enabled
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as extracted value was empty.")
else:
logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found in token data.")
# Push None if storing is enabled but attribute not found
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as attribute was not found.")
else:
logger.info("get_socks_proxy is False. Skipping proxy extraction.")
# Push None if storing is enabled but extraction was skipped
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as get_socks_proxy=False.")
# Get the original command from the server
ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
if not ytdlp_cmd:
logger.error("No 'ytdlpCommand' attribute found in token data.")
raise AirflowException("Required 'ytdlpCommand' not received from service.")
logger.info(f"Original command received from server: {ytdlp_cmd}")
# Log example usage command (DO NOT MODIFY the original command here)
if info_json_path:
# Use double quotes for paths/proxy in example for robustness
example_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
if socks_proxy:
example_cmd += f" --proxy \"{socks_proxy}\""
example_cmd += " --verbose --simulate" # Add useful flags for testing
logger.info(f"\n--- Example usage with saved info.json ---")
logger.info(example_cmd)
logger.info(f"(Note: The actual command with tokens/cookies is pushed to XCom as 'ytdlp_command')")
latest_json_path = os.path.join(os.path.dirname(info_json_path), 'latest.json')
logger.info(f"(You can also use 'latest.json': {latest_json_path})")
logger.info(f"-------------------------------------------\n")
else:
logger.info("\n--- Original command pushed to XCom ('ytdlp_command') ---")
if socks_proxy:
logger.info(f"Use the extracted proxy '{socks_proxy}' (pushed to XCom if store_socks_proxy=True) with the --proxy flag.")
logger.info("Add --verbose and --simulate flags for testing the command.")
logger.info(f"-------------------------------------------------------\n")
# Push the *original* command to XCom
context['task_instance'].xcom_push(key='ytdlp_command', value=ytdlp_cmd)
logger.info(f"Pushed original command to XCom key 'ytdlp_command'.")
# Note: Returning ytdlp_cmd below implicitly pushes the same value
# to XCom under the key 'return_value'. Downstream tasks should
# preferably use the explicitly pushed 'ytdlp_command' key for clarity.
return ytdlp_cmd # Return the original command
except AirflowException as e: # Catch AirflowExceptions raised explicitly in the code above
logger.error(f"Operation failed due to AirflowException: {e}")
raise # Re-raise AirflowExceptions to ensure task failure
except (TTransportException, PBServiceException) as e: # Catch specific Thrift/Service errors not already wrapped
logger.error(f"Unhandled Thrift/Service error: {e}", exc_info=True) # Add traceback for context
raise AirflowException(f"Unhandled YTDLP service error: {e}") # Wrap in AirflowException
except Exception as e: # General catch-all for truly unexpected errors
# Log with traceback for unexpected errors
logger.error(f"Caught unexpected error in YtdlpOpsOperator: {e}", exc_info=True)
# Ensure any unexpected error explicitly fails the task with AirflowException
raise AirflowException(f"Unexpected error caused task failure: {e}")
finally:
if transport and transport.isOpen(): # Check if transport exists and is open before closing
logger.info("Closing Thrift transport.")
transport.close()
# --- Helper Methods ---
def _get_info_json(self, token_data):
"""Safely extracts infoJson from token data."""
info_json = getattr(token_data, 'infoJson', None)
if info_json:
logger.debug("Extracted infoJson from token data.")
else:
logger.debug("No infoJson attribute found in token data.")
return info_json
def _is_valid_json(self, json_str):
"""Checks if a string is valid JSON."""
if not json_str or not isinstance(json_str, str):
logger.debug("Input is not a non-empty string, considered invalid JSON.")
return False
try:
json.loads(json_str)
logger.debug("JSON string validation successful.")
return True
except json.JSONDecodeError as e:
logger.warning(f"JSON validation failed: {e}")
return False
def _save_info_json(self, context, info_json):
"""Saves info_json to a file, handling directory creation and logging. Returns the path on success, None on failure."""
try:
# Get URL from params/context for video ID extraction
url_param = context.get('params', {}).get('url', self.url)
video_id = self._extract_video_id(url_param) # Use internal helper
# Render the info_json_dir template
save_dir_template = self.info_json_dir or "." # Default to current dir if template is None or empty string
save_dir = self.render_template(save_dir_template, context)
if not save_dir: # Handle case where template renders to empty string
logger.warning(f"Rendered info_json_dir template '{save_dir_template}' resulted in an empty path. Defaulting to '.'")
save_dir = "."
logger.info(f"Target directory for info.json (rendered): {save_dir}")
# Ensure directory exists
try:
os.makedirs(save_dir, exist_ok=True)
logger.info(f"Ensured directory exists: {save_dir}")
except OSError as e:
logger.error(f"Could not create directory {save_dir}: {e}. Cannot save info.json.")
return None # Indicate failure
# Construct filename (using potentially overridden account_id)
account_id_param = context.get('params', {}).get('account_id', self.account_id)
timestamp = int(time.time())
base_filename = f"info_{video_id}_{account_id_param}_{timestamp}.json" if video_id else f"info_{account_id_param}_{timestamp}.json"
info_json_path = os.path.join(save_dir, base_filename)
latest_json_path = os.path.join(save_dir, "latest.json") # Path for the latest symlink/copy
# Write to timestamped file
try:
logger.info(f"Writing info.json content (received from service) to {info_json_path}...")
with open(info_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}")
except IOError as e:
logger.error(f"Failed to write info.json to {info_json_path}: {e}")
return None # Indicate failure
# Write to latest.json (overwrite) - best effort
try:
with open(latest_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
logger.info(f"Updated latest.json file: {latest_json_path}")
except IOError as e:
# Log warning but don't fail the whole save if only latest.json fails
logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}")
return info_json_path # Return path on success (even if latest.json failed)
except Exception as e:
logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True)
return None # Indicate failure
def _extract_video_id(self, url):
"""Extracts YouTube video ID from URL (internal helper)."""
if not url or not isinstance(url, str):
logger.debug("URL is empty or not a string, cannot extract video ID.")
return None
try:
# Basic extraction logic (can be enhanced for more URL types)
video_id = None
if 'youtube.com/watch?v=' in url:
video_id = url.split('v=')[1].split('&')[0]
elif 'youtu.be/' in url:
video_id = url.split('youtu.be/')[1].split('?')[0]
# Ensure it looks like a video ID (typically 11 chars, but can vary)
if video_id and len(video_id) >= 11:
video_id = video_id[:11] # Take first 11 chars as standard ID length
logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
return video_id
else:
logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
return None
except Exception as e:
logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
return None
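# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption, not code from this repo): the Redis
# discovery branch in execute() above looks up a hash at "ytdlp:{account_id}"
# and expects "ip" and "port" fields. A service instance could register itself
# for that lookup roughly like this:
#
#     import redis
#     r = redis.Redis(host="redis", port=6379, password="...", decode_responses=True)
#     r.hset("ytdlp:default", mapping={"ip": "10.0.0.5", "port": "9090"})
# ---------------------------------------------------------------------------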
# =============================================================================
# Python Callables for Tasks
# =============================================================================
def display_token_info(**context):
"""Displays token info from XCom, parses info.json, and logs example commands."""
ti = context['task_instance']
logger.info("Starting display_token_info task.")
# Pull data from XCom (provide default values)
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
logger.info("\n=== Pulled Token Information from XCom ===")
logger.info(f"Info.json path: {info_json_path or 'Not found/Not saved'}")
logger.info(f"SOCKS Proxy: {socks_proxy or 'Not found/Not extracted'}")
logger.info(f"Original yt-dlp command (with tokens): {ytdlp_command or 'Not found'}")
result = {
'info_path': info_json_path,
'proxy': socks_proxy,
'ytdlp_command': ytdlp_command,
'video_info': None,
'commands': {},
'error': None
}
if info_json_path and os.path.exists(info_json_path):
logger.info(f"\n=== Processing Video Information from: {info_json_path} ===")
try:
with open(info_json_path, 'r', encoding='utf-8') as f:
info = json.load(f)
# Extract and log basic video info safely
title = info.get('title', 'Unknown Title')
uploader = info.get('uploader', 'Unknown Author')
duration = info.get('duration_string', 'Unknown Length')
upload_date_str = info.get('upload_date') # Format: YYYYMMDD
upload_date_formatted = 'Unknown Date'
if upload_date_str:
try:
# Validate format before parsing
if len(upload_date_str) == 8 and upload_date_str.isdigit():
upload_date_formatted = datetime.strptime(upload_date_str, '%Y%m%d').strftime('%Y-%m-%d')
else:
logger.warning(f"Upload date '{upload_date_str}' is not in YYYYMMDD format.")
except ValueError:
logger.warning(f"Could not parse upload_date '{upload_date_str}'")
result['video_info'] = {
'title': title,
'uploader': uploader,
'upload_date': upload_date_formatted, # Store formatted date
'duration': duration
}
logger.info(f"Title: {title}")
logger.info(f"Author: {uploader}")
logger.info(f"Date: {upload_date_formatted}")
logger.info(f"Length: {duration}")
logger.info("\n=== Example yt-dlp Commands (using saved info.json) ===")
base_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
if socks_proxy:
base_cmd += f" --proxy \"{socks_proxy}\""
# Command to list formats
format_cmd = f"{base_cmd} -F"
result['commands']['format'] = format_cmd
logger.info(f"List formats command: {format_cmd}")
# Execute and log the format listing command
logger.info("\n--- Executing Format List Command ---")
try:
# Use os.popen for simplicity, capture output
logger.info(f"Running: {format_cmd}")
format_output = os.popen(format_cmd).read()
logger.info("--- Format List Output ---")
logger.info(format_output)
logger.info("--------------------------")
except Exception as e:
logger.error(f"Error executing format command: {e}")
# Command to simulate download
simulate_cmd = f"{base_cmd} --simulate --verbose" # Add verbose for more info
result['commands']['simulate'] = simulate_cmd
logger.info(f"Simulate download command: {simulate_cmd}")
# Execute and log the simulation command
logger.info("\n--- Executing Simulation Command ---")
try:
logger.info(f"Running: {simulate_cmd}")
simulate_output = os.popen(simulate_cmd).read()
logger.info("--- Simulation Output ---")
logger.info(simulate_output)
logger.info("-------------------------")
except Exception as e:
logger.error(f"Error executing simulation command: {e}")
# Basic download command
download_cmd = base_cmd
result['commands']['download_base'] = download_cmd
logger.info(f"Base download command (add format selection, output path): {download_cmd}")
# Push generated example commands to XCom for potential downstream use
# ti.xcom_push(key='format_cmd', value=format_cmd) # Removed as requested
# ti.xcom_push(key='simulate_cmd', value=simulate_cmd) # Removed as requested
ti.xcom_push(key='download_cmd', value=download_cmd)
logger.info(f"Pushed key 'download_cmd' to XCom with value: {download_cmd}")
except json.JSONDecodeError as e:
error_msg = f"Failed to parse info.json file '{info_json_path}': {e}"
logger.error(error_msg)
result['error'] = error_msg
except FileNotFoundError:
error_msg = f"Info.json file not found at path: {info_json_path}"
logger.error(error_msg)
result['error'] = error_msg
except Exception as e:
error_msg = f"Error processing info.json file '{info_json_path}': {str(e)}"
logger.error(error_msg, exc_info=True)
result['error'] = error_msg
elif info_json_path:
error_msg = f"Info.json path provided ('{info_json_path}') but file does not exist."
logger.warning(error_msg)
result['error'] = error_msg
else:
logger.warning("No info.json path found in XCom. Cannot display video details or generate example commands.")
result['error'] = "Info.json path not available."
logger.info("Finished display_token_info task.")
# Return the collected information (useful if used as a PythonOperator return value)
return json.dumps(result) # Return as JSON string for XCom compatibility if needed
def store_token_info(**context):
"""Stores retrieved token information (command, proxy, info.json) in Redis."""
ti = context['task_instance']
# Use the redis_conn_id defined in the operator/DAG params if possible, else default
redis_conn_id = context['params'].get('redis_conn_id', 'redis_default')
redis_hook = RedisHook(redis_conn_id=redis_conn_id)
logger.info(f"Starting store_token_info task using Redis connection '{redis_conn_id}'.")
try:
# Pull necessary data from XCom and context
url = context['params'].get('url')
if not url:
            # Attempt to get the URL from the DAG run conf as a fallback
            dag_run = context.get('dag_run')
            url = (dag_run.conf or {}).get('url') if dag_run else None
            if not url:
                raise ValueError("URL parameter is missing in context['params'] and dag_run.conf")
            logger.warning("URL parameter missing in context['params'], using URL from dag_run.conf.")
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy') or '' # Default to empty string if None
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
if not ytdlp_command:
logger.warning("ytdlp_command not found in XCom. Storing empty value.")
ytdlp_command = '' # Store empty if not found
# Construct the base command using info.json
ytdlp_command_base = ''
if info_json_path and os.path.exists(info_json_path):
ytdlp_command_base = f"yt-dlp --load-info-json \"{info_json_path}\""
logger.info(f"Constructed base command: {ytdlp_command_base}")
else:
logger.warning("Cannot construct base command: info_json_path not valid.")
# Construct the command with tokens and proxy
ytdlp_command_tokens = ytdlp_command # Start with original command from server
if socks_proxy:
ytdlp_command_tokens += f" --proxy \"{socks_proxy}\""
logger.info("Appended proxy to token command.")
data_to_store = {
'url': url,
'ytdlp_command': ytdlp_command_base, # Store the base command
'proxy': socks_proxy,
'info_json_path': info_json_path or '' # Store path even if None/empty
# 'info_json' will be added below
}
# Read info.json content if path exists
info_json_content = None
if info_json_path and os.path.exists(info_json_path):
try:
with open(info_json_path, 'r', encoding='utf-8') as f:
# Read and immediately validate JSON structure before storing
info_json_content = json.load(f)
# Store the validated JSON as a string
data_to_store['info_json'] = json.dumps(info_json_content)
logger.info(f"Read and validated info.json content from: {info_json_path}")
except json.JSONDecodeError as e:
logger.error(f"Failed to parse info.json file '{info_json_path}' as JSON: {e}. Storing empty content.")
data_to_store['info_json'] = '' # Store empty string on parse error
except Exception as e:
logger.error(f"Failed to read info.json file '{info_json_path}': {e}. Storing empty content.")
data_to_store['info_json'] = '' # Store empty string on other read errors
else:
logger.warning(f"info_json_path ('{info_json_path}') not found or invalid. Storing without info_json content.")
data_to_store['info_json'] = '' # Store empty string if no path
        # Determine the Redis key from the video ID. The extraction below mirrors the
        # operator's helper but is re-implemented here to keep this callable self-contained.
video_id = None
try:
if 'youtube.com/watch?v=' in url:
video_id = url.split('v=')[1].split('&')[0][:11]
elif 'youtu.be/' in url:
video_id = url.split('youtu.be/')[1].split('?')[0][:11]
except Exception:
pass # Ignore errors in ID extraction for key generation
redis_key = f"token_info:{video_id or 'unknown'}"
logger.info(f"Determined Redis key: {redis_key}")
# Store data in Redis hash
# Log presence/absence rather than full content for potentially large fields
logger.info(f"Data to store in Redis key '{redis_key}': "
f"URL='{data_to_store['url']}', "
f"Command={'<present>' if data_to_store['ytdlp_command'] else '<empty>'}, "
f"Proxy='{data_to_store['proxy'] or '<empty>'}', "
f"Path='{data_to_store['info_json_path'] or '<empty>'}', "
f"JSON Content={'<present>' if data_to_store.get('info_json') else '<empty>'}")
with redis_hook.get_conn() as redis_client:
            # video_id and redis_key were already determined above; reuse them here and
            # add video_id, timestamp, and the constructed ytdlp_command_tokens.
data_to_store['video_id'] = video_id or 'unknown'
data_to_store['timestamp'] = int(time.time())
data_to_store['ytdlp_command_tokens'] = ytdlp_command_tokens # Store the original token command
# Log fields being stored
log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()}
logger.info(f"Storing in Redis key '{redis_key}': {log_data}")
redis_client.hset(redis_key, mapping=data_to_store)
# Set expiration (e.g., 24 hours = 86400 seconds)
redis_client.expire(redis_key, 86400)
logger.info(f"Successfully stored token info in Redis key '{redis_key}' with 24h expiration.")
# Log the final stored data again for clarity
final_log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()}
logger.info(f"--- Final Data Stored in Redis Key '{redis_key}' ---")
logger.info(final_log_data)
logger.info("----------------------------------------------------")
except Exception as e:
logger.error(f"Failed to store token info in Redis: {e}", exc_info=True)
# Re-raise as AirflowException to fail the task
raise AirflowException(f"Failed to store token info in Redis: {e}")
logger.info("Finished store_token_info task.")
# =============================================================================
# DAG Definition
# =============================================================================
# Update default_args to match ytdlp_client_dag.py structure
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False, # Match reference DAG
'email_on_retry': False, # Match reference DAG
'retries': 1, # Default task retries
'retry_delay': timedelta(minutes=5), # Standard task retry delay
'start_date': days_ago(1) # Best practice start date
}
# Update DAG definition
with DAG(
dag_id='ytdlp_client_dag_v2.1',
default_args=default_args,
schedule_interval=None, # Manually triggered DAG
catchup=False, # Don't run for past missed schedules
description='DAG for YTDLP operations using Thrift client (V2 - Refactored)', # Updated description
tags=['ytdlp', 'thrift', 'client', 'v2'], # Updated tags for better filtering
params={
# Define DAG parameters with defaults and types for UI clarity
'url': Param('https://www.youtube.com/watch?v=sOlTX9uxUtM', type=["null", "string"], description="Required: The video URL to process."), # Default URL
'redis_enabled': Param(False, type="boolean", description="Use Redis for service discovery? If False, uses service_ip/port."), # Default to direct connection
'service_ip': Param('85.192.30.55', type="string", description="Service IP if redis_enabled=False."), # Default service IP
'service_port': Param(9090, type="integer", description="Service port if redis_enabled=False."), # Default service port
'account_id': Param('account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09', type="string", description="Account ID for Redis lookup or direct call."), # Updated default account_id
'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
# Use Airflow Variable for downloads directory, matching reference DAG structure
'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json. Uses Airflow Variable 'DOWNLOADS_TEMP' or default.")
}
) as dag:
# Define Tasks
get_token = YtdlpOpsOperator(
task_id='get_token',
# Pass templated parameters from DAG run config
url="{{ params.url }}",
redis_enabled="{{ params.redis_enabled }}",
service_ip="{{ params.service_ip }}",
service_port="{{ params.service_port }}",
account_id="{{ params.account_id }}",
save_info_json=True,
info_json_dir="{{ params.info_json_dir }}",
get_socks_proxy=True,
store_socks_proxy=True,
timeout="{{ params.timeout }}",
retries=MAX_RETRIES, # Operator-specific retries if needed, else use DAG default
retry_delay=RETRY_DELAY, # Operator-specific delay if needed
# Add callbacks for logging success/failure, similar to reference DAG
on_failure_callback=lambda context: logger.error(f"Task {context['task_instance_key_str']} failed."),
on_success_callback=lambda context: logger.info(f"Task {context['task_instance_key_str']} succeeded.")
)
# Add task documentation (visible in Airflow UI)
get_token.doc_md = """
### Get Token Task
Connects to the YTDLP Thrift service (either directly or via Redis discovery)
to retrieve an authentication token and video metadata (info.json).
**Pushes to XCom:**
- `info_json_path`: Path to the saved info.json file (or None if not saved/failed).
- `socks_proxy`: The extracted SOCKS proxy string (or None if not requested/found).
- `ytdlp_command`: The original command string received from the server (contains tokens/cookies).
- Uses parameters defined in the DAG run configuration.
"""
# Optional: Add a task to explicitly check XComs for debugging (like in reference DAG)
def _check_xcom_callable(**context):
"""Logs XCom values pushed by the get_token task."""
ti = context['task_instance']
logger.info("--- Checking XCom values pushed by get_token ---")
keys_to_check = ['info_json_path', 'socks_proxy', 'ytdlp_command']
xcom_values = {}
for key in keys_to_check:
value = ti.xcom_pull(task_ids='get_token', key=key)
xcom_values[key] = value
# Avoid logging potentially sensitive command details fully in production
if key == 'ytdlp_command' and value:
log_value = f"{value[:50]}..." # Log truncated command
else:
log_value = value
logger.info(f"XCom key='{key}': {log_value}")
logger.info("----------------------------------------------")
return xcom_values # Return values for potential future use
check_xcom_task = PythonOperator(
task_id='check_xcom_after_get_token',
python_callable=_check_xcom_callable,
)
check_xcom_task.doc_md = "Logs the values pushed to XCom by the 'get_token' task for debugging purposes."
display_info = PythonOperator(
task_id='display_token_info',
python_callable=display_token_info,
trigger_rule='all_success'
)
display_info.doc_md = """
### Display Token Info Task
Pulls information from XCom, parses the `info.json` file (if available),
logs video details, and generates example `yt-dlp` commands.
**Pulls from XCom (task_id='get_token'):**
- `info_json_path`
- `socks_proxy`
- `ytdlp_command`
**Pushes to XCom:**
- `download_cmd`: Base command using `--load-info-json` (user needs to add format/output).
"""
store_info = PythonOperator(
task_id='store_token_info', # Use consistent task ID naming
python_callable=store_token_info,
)
store_info.doc_md = """
### Store Token Info Task
Pulls information from XCom and DAG parameters, reads the `info.json` content,
and stores relevant data in a Redis hash.
**Pulls from XCom (task_id='get_token'):**
- `ytdlp_command`
- `socks_proxy`
- `info_json_path`
**Pulls from DAG context:**
- `params['url']` (or `dag_run.conf['url']`)
**Stores in Redis Hash (key: `token_info:<video_id>`):**
- `url`: The video URL.
- `ytdlp_command`: Base command using `--load-info-json`.
- `proxy`: The SOCKS proxy string.
- `info_json_path`: Path to the saved info.json file.
- `info_json`: The full content of the info.json file (as a JSON string).
- `video_id`: Extracted video ID.
- `timestamp`: Unix timestamp of storage.
- `ytdlp_command_tokens`: The original command string from the server (contains tokens/cookies).
Sets a 24-hour expiration on the Redis key.
"""
# Define task dependencies matching the reference DAG structure
get_token >> check_xcom_task >> display_info >> store_info
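    # Illustrative only: triggering this DAG from the CLI with overridden params
    # (the conf values below are examples, not enforced defaults):
    #   airflow dags trigger ytdlp_client_dag_v2.1 \
    #     --conf '{"url": "https://www.youtube.com/watch?v=<VIDEO_ID>", "redis_enabled": false}'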

View File

@ -1,179 +0,0 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.
"""
Airflow DAG for manually checking the status (type and size) of a specific Redis key used by YTDLP queues.
"""
from airflow import DAG
from airflow.exceptions import AirflowException
from airflow.models.param import Param
from airflow.operators.python import PythonOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
from datetime import datetime, timedelta, timezone
import logging
import json
import redis # Import redis exceptions if needed
# Configure logging
logger = logging.getLogger(__name__)
# Default settings
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_QUEUE_BASE_NAME = 'video_queue'
DEFAULT_MAX_ITEMS_TO_LIST = 25
# Import utility functions
from utils.redis_utils import _get_redis_client
# --- Python Callable for Check and List Task ---
def check_and_list_queue_callable(**context):
"""Checks the type and size of a Redis key and lists its recent contents."""
params = context['params']
redis_conn_id = params['redis_conn_id']
# queue_suffix is passed from the PythonOperator's op_kwargs, which are available in the context
queue_suffix = context['queue_suffix']
queue_name = params.get('queue_name', DEFAULT_QUEUE_BASE_NAME)
queue_to_check = f"{queue_name}{queue_suffix}"
max_items = int(params.get('max_items_to_list', DEFAULT_MAX_ITEMS_TO_LIST))
logger.info(f"--- Checking Status and Contents of Redis Key: '{queue_to_check}' ---")
logger.info(f"Using connection '{redis_conn_id}', listing up to {max_items} items.")
try:
redis_client = _get_redis_client(redis_conn_id)
key_type_bytes = redis_client.type(queue_to_check)
key_type = key_type_bytes.decode('utf-8')
if key_type == 'list':
list_length = redis_client.llen(queue_to_check)
logger.info(f"Redis key '{queue_to_check}' is a LIST with {list_length} items.")
if list_length > 0:
items_to_fetch = min(max_items, list_length)
# lrange with negative indices gets items from the end (most recent for rpush)
contents_bytes = redis_client.lrange(queue_to_check, -items_to_fetch, -1)
contents = [item.decode('utf-8') for item in contents_bytes]
contents.reverse() # Show most recent first
logger.info(f"--- Showing most recent {len(contents)} of {list_length} items ---")
for i, item in enumerate(contents):
logger.info(f" [recent_{i}]: {item}")
if list_length > len(contents):
logger.info(f" ... ({list_length - len(contents)} older items not shown)")
logger.info(f"--- End of List Contents ---")
elif key_type == 'hash':
hash_size = redis_client.hlen(queue_to_check)
logger.info(f"Redis key '{queue_to_check}' is a HASH with {hash_size} fields.")
if hash_size > 0:
logger.info(f"--- Showing a sample of up to {max_items} fields ---")
item_count = 0
# Using hscan_iter to safely iterate over hash fields, count is a hint
for field_bytes, value_bytes in redis_client.hscan_iter(queue_to_check, count=max_items):
if item_count >= max_items:
logger.info(f" ... (stopped listing after {max_items} items of {hash_size})")
break
field = field_bytes.decode('utf-8')
value = value_bytes.decode('utf-8')
# Try to pretty-print if value is JSON
try:
parsed_value = json.loads(value)
# Check for timestamp to show age
timestamp = parsed_value.get('end_time') or parsed_value.get('start_time')
age_str = ""
if timestamp:
age_seconds = (datetime.now(timezone.utc) - datetime.fromtimestamp(timestamp, timezone.utc)).total_seconds()
age_str = f" (age: {timedelta(seconds=age_seconds)})"
pretty_value = json.dumps(parsed_value, indent=2)
logger.info(f" Field '{field}'{age_str}:\n{pretty_value}")
except (json.JSONDecodeError, TypeError):
logger.info(f" Field '{field}': {value}")
item_count += 1
logger.info(f"--- End of Hash Contents ---")
elif key_type == 'none':
logger.info(f"Redis key '{queue_to_check}' does not exist.")
else:
logger.info(f"Redis key '{queue_to_check}' is of type '{key_type}'. Listing contents for this type is not implemented.")
except Exception as e:
logger.error(f"Failed to check/list contents of Redis key '{queue_to_check}': {e}", exc_info=True)
raise AirflowException(f"Failed to process Redis key: {e}")
# --- DAG Definition ---
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 0, # No retries for a manual check/list operation
'start_date': days_ago(1)
}
with DAG(
dag_id='ytdlp_mgmt_queues_check_status',
default_args=default_args,
schedule_interval=None, # Manually triggered
catchup=False,
description='Manually check the status and recent items of all YTDLP Redis queues for a given base name.',
tags=['ytdlp', 'queue', 'management', 'redis', 'manual', 'status', 'list'],
params={
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
'queue_name': Param(
DEFAULT_QUEUE_BASE_NAME,
type="string",
description="Base name for the Redis queues (e.g., 'video_queue')."
),
'max_items_to_list': Param(DEFAULT_MAX_ITEMS_TO_LIST, type="integer", description="Maximum number of recent items/fields to list from each queue."),
}
) as dag:
check_inbox_queue = PythonOperator(
task_id='check_inbox_queue',
python_callable=check_and_list_queue_callable,
op_kwargs={'queue_suffix': '_inbox'},
)
check_inbox_queue.doc_md = """
### Check Inbox Queue (`_inbox`)
Checks the status and lists the most recent URLs waiting to be processed.
The full queue name is `{{ params.queue_name }}_inbox`.
"""
check_progress_queue = PythonOperator(
task_id='check_progress_queue',
python_callable=check_and_list_queue_callable,
op_kwargs={'queue_suffix': '_progress'},
)
check_progress_queue.doc_md = """
### Check Progress Queue (`_progress`)
Checks the status and lists a sample of URLs currently being processed.
The full queue name is `{{ params.queue_name }}_progress`.
"""
check_result_queue = PythonOperator(
task_id='check_result_queue',
python_callable=check_and_list_queue_callable,
op_kwargs={'queue_suffix': '_result'},
)
check_result_queue.doc_md = """
### Check Result Queue (`_result`)
Checks the status and lists a sample of successfully processed URLs.
The full queue name is `{{ params.queue_name }}_result`.
"""
check_fail_queue = PythonOperator(
task_id='check_fail_queue',
python_callable=check_and_list_queue_callable,
op_kwargs={'queue_suffix': '_fail'},
)
check_fail_queue.doc_md = """
### Check Fail Queue (`_fail`)
Checks the status and lists a sample of failed URLs.
The full queue name is `{{ params.queue_name }}_fail`.
"""

View File

@ -1,343 +0,0 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.
"""
DAG for processing a single YouTube URL passed via DAG run configuration.
This is the "Worker" part of a Sensor/Worker pattern.
This DAG has been refactored to use the TaskFlow API to implement worker affinity,
ensuring all tasks for a single URL run on the same machine.
"""
from __future__ import annotations
from airflow.decorators import task, task_group
from airflow.exceptions import AirflowException, AirflowSkipException
from airflow.models import Variable
from airflow.models.dag import DAG
from airflow.models.param import Param
from airflow.models.xcom_arg import XComArg
from airflow.operators.dummy import DummyOperator
from airflow.operators.bash import BashOperator
from airflow.utils.dates import days_ago
from airflow.api.common.trigger_dag import trigger_dag
from datetime import timedelta, datetime
import json
import logging
import os
import random
import re
import socket
import time
import traceback
import uuid
import subprocess
import shlex
# Import utility functions and Thrift modules
from utils.redis_utils import _get_redis_client
# Handle potential import issues with Thrift modules
try:
from pangramia.yt.common.ttypes import TokenUpdateMode
except ImportError as e:
logging.warning(f"Could not import TokenUpdateMode from pangramia.yt.common.ttypes: {e}")
TokenUpdateMode = None
try:
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
except ImportError as e:
logging.warning(f"Could not import PBServiceException/PBUserException from pangramia.yt.exceptions.ttypes: {e}")
PBServiceException = Exception
PBUserException = Exception
try:
from pangramia.yt.tokens_ops import YTTokenOpService
except ImportError as e:
logging.warning(f"Could not import YTTokenOpService from pangramia.yt.tokens_ops: {e}")
YTTokenOpService = None
try:
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport
from thrift.transport.TTransport import TTransportException
except ImportError as e:
logging.warning(f"Could not import thrift modules: {e}")
TBinaryProtocol = None
TSocket = None
TTransport = None
TTransportException = Exception
# Configure logging
logger = logging.getLogger(__name__)
# Default settings from Airflow Variables or hardcoded fallbacks
DEFAULT_QUEUE_NAME = 'video_queue'
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_TIMEOUT = 3600
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)
# The queue is set to a fallback here. The actual worker-specific queue is
# assigned just-in-time by the task_instance_mutation_hook in airflow_local_settings.py,
# which reads the 'worker_queue' from the DAG run configuration.
DEFAULT_ARGS = {
'owner': 'airflow',
'retries': 0,
'queue': 'queue-dl', # Fallback queue. Will be overridden by the policy hook.
}
# --- Helper Functions ---
def _get_thrift_client(host, port, timeout):
"""Helper to create and connect a Thrift client."""
if not TSocket or not TTransport or not TBinaryProtocol:
raise AirflowException("Required Thrift modules are not available")
transport = TSocket.TSocket(host, port)
transport.setTimeout(timeout * 1000)
transport = TTransport.TFramedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = YTTokenOpService.Client(protocol) if YTTokenOpService else None
if client:
transport.open()
logger.info(f"Connected to Thrift server at {host}:{port}")
return client, transport
def _extract_video_id(url):
"""Extracts YouTube video ID from URL."""
if not url or not isinstance(url, str):
return None
patterns = [r'v=([a-zA-Z0-9_-]{11})', r'youtu\.be/([a-zA-Z0-9_-]{11})']
for pattern in patterns:
match = re.search(pattern, url)
if match:
return match.group(1)
return None
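# Illustrative only: URLs the patterns above match (the 11-character ID is an example):
#   https://www.youtube.com/watch?v=dQw4w9WgXcQ -> 'dQw4w9WgXcQ'
#   https://youtu.be/dQw4w9WgXcQ                -> 'dQw4w9WgXcQ'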
def _get_account_pool(params: dict) -> list:
"""
Gets the list of accounts to use for processing, filtering out banned/resting accounts.
Supports explicit list, prefix-based generation, and single account modes.
"""
account_pool_str = params.get('account_pool', 'default_account')
accounts = []
is_prefix_mode = False
if ',' in account_pool_str:
accounts = [acc.strip() for acc in account_pool_str.split(',') if acc.strip()]
else:
prefix = account_pool_str
pool_size_param = params.get('account_pool_size')
if pool_size_param is not None:
is_prefix_mode = True
pool_size = int(pool_size_param)
accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
else:
accounts = [prefix]
if not accounts:
raise AirflowException("Initial account pool is empty.")
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
try:
redis_client = _get_redis_client(redis_conn_id)
active_accounts = []
for account in accounts:
status_bytes = redis_client.hget(f"account_status:{account}", "status")
status = status_bytes.decode('utf-8') if status_bytes else "ACTIVE"
if status not in ['BANNED'] and 'RESTING' not in status:
active_accounts.append(account)
if not active_accounts and accounts:
auto_create = params.get('auto_create_new_accounts_on_exhaustion', False)
if auto_create and is_prefix_mode:
new_account_id = f"{account_pool_str}-auto-{str(uuid.uuid4())[:8]}"
logger.warning(f"Account pool exhausted. Auto-creating new account: '{new_account_id}'")
active_accounts.append(new_account_id)
else:
raise AirflowException("All accounts in the configured pool are currently exhausted.")
accounts = active_accounts
except Exception as e:
logger.error(f"Could not filter accounts from Redis. Using unfiltered pool. Error: {e}", exc_info=True)
if not accounts:
raise AirflowException("Account pool is empty after filtering.")
logger.info(f"Final active account pool with {len(accounts)} accounts.")
return accounts
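# Illustrative only: the three 'account_pool' modes accepted above (values are hypothetical):
#   explicit list : account_pool='acc_a, acc_b'             -> ['acc_a', 'acc_b']
#   prefix mode   : account_pool='acc', account_pool_size=3 -> ['acc_01', 'acc_02', 'acc_03']
#   single account: account_pool='acc_solo'                 -> ['acc_solo']
# (Banned/resting accounts are then filtered out via the 'account_status:<id>' hashes.)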
# =============================================================================
# TASK DEFINITIONS (TaskFlow API)
# =============================================================================
@task
def get_url_and_assign_account(**context):
"""
Gets the URL to process from the DAG run configuration and assigns an active account.
This is the first task in the pinned-worker DAG.
"""
params = context['params']
# Update yt-dlp to latest nightly before every run
subprocess.run(["/usr/local/bin/update-yt-dlp.sh"], check=True)
# The URL is passed by the dispatcher DAG.
url_to_process = params.get('url_to_process')
if not url_to_process:
raise AirflowException("'url_to_process' was not found in the DAG run configuration.")
logger.info(f"Received URL '{url_to_process}' to process.")
# Account assignment logic is the same as before.
account_id = random.choice(_get_account_pool(params))
logger.info(f"Selected account '{account_id}' for this run.")
return {
'url_to_process': url_to_process,
'account_id': account_id,
'accounts_tried': [account_id],
}
@task
def get_token(initial_data: dict, **context):
"""Makes a single attempt to get a token from the Thrift service."""
ti = context['task_instance']
params = context['params']
account_id = initial_data['account_id']
url = initial_data['url_to_process']
info_json_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')
host, port, timeout = params['service_ip'], int(params['service_port']), int(params.get('timeout', DEFAULT_TIMEOUT))
machine_id = params.get('machine_id') or socket.gethostname()
logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' ---")
client, transport = None, None
try:
client, transport = _get_thrift_client(host, port, timeout)
if not client or not TokenUpdateMode:
raise AirflowException("Thrift client or TokenUpdateMode not available")
token_data = client.getOrRefreshToken(accountId=account_id, updateType=TokenUpdateMode.AUTO, url=url, clients=params.get('clients'), machineId=machine_id)
        info_json = getattr(token_data, 'infoJson', None)
        try:
            if not (info_json and json.loads(info_json)):
                raise AirflowException("Service returned success but info.json was empty or invalid.")
        except json.JSONDecodeError:
            raise AirflowException("Service returned success but info.json was not valid JSON.")
video_id = _extract_video_id(url)
os.makedirs(info_json_dir, exist_ok=True)
# Use a readable timestamp for a unique filename on each attempt.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
info_json_path = os.path.join(info_json_dir, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json")
with open(info_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
return {
'info_json_path': info_json_path,
'socks_proxy': getattr(token_data, proxy_attr) if proxy_attr else None,
            'ytdlp_command': getattr(token_data, 'ytdlpCommand', None),
'successful_account_id': account_id,
'original_url': url, # Include original URL for fallback
}
except (PBServiceException, PBUserException, TTransportException) as e:
error_context = getattr(e, 'context', None)
if isinstance(error_context, str):
            try:
                error_context = json.loads(error_context.replace("'", "\""))
            except Exception:
                pass
error_details = {
'error_message': getattr(e, 'message', str(e)),
'error_code': getattr(e, 'errorCode', 'TRANSPORT_ERROR'),
'proxy_url': error_context.get('proxy_url') if isinstance(error_context, dict) else None
}
logger.error(f"Thrift call failed for account '{account_id}'. Exception: {error_details['error_message']}")
ti.xcom_push(key='error_details', value=error_details)
# If it's not a connection error, run diagnostic yt-dlp command
if error_details['error_code'] not in ["SOCKS5_CONNECTION_FAILED", "SOCKET_TIMEOUT", "TRANSPORT_ERROR", "CAMOUFOX_TIMEOUT"]:
_run_diagnostic_yt_dlp(url, error_details.get('proxy_url'), params.get('clients', 'web'))
raise AirflowException(f"Thrift call failed: {error_details['error_message']}")
finally:
if transport and transport.isOpen():
transport.close()
def _run_diagnostic_yt_dlp(url, proxy, clients):
"""Runs yt-dlp with diagnostic flags to capture failed responses."""
logger.warning("Running diagnostic yt-dlp command to capture failed response...")
dump_dir = "/opt/airflow/dumps"
os.makedirs(dump_dir, exist_ok=True)
cmd = [
'yt-dlp',
'--extractor-args', f'youtube:player-client={clients}',
'--write-pages',
'--proxy', proxy or '',
'-FvU',
url,
'--write-info-json',
'--print', 'filename',
'--continue',
'--no-progress',
'--no-simulate',
'--ignore-errors',
'--no-playlist'
]
logger.info(f"Executing diagnostic command: {' '.join(shlex.quote(arg) for arg in cmd)}")
logger.info(f"Diagnostic dump will be saved to: {dump_file}")
try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300, cwd=dump_dir)
logger.info(f"Diagnostic yt-dlp exit code: {result.returncode}")
if result.stdout:
logger.info(f"Diagnostic output:\n{result.stdout}")
if result.stderr:
logger.error(f"Diagnostic stderr:\n{result.stderr}")
except subprocess.TimeoutExpired:
logger.error("Diagnostic yt-dlp command timed out after 5 minutes")
except Exception as e:
logger.error(f"Failed to run diagnostic yt-dlp: {e}")
@task.branch
def handle_bannable_error_branch(task_id_to_check: str, **context):
"""Inspects a failed task and routes to retry logic if the error is bannable."""
ti = context['task_instance']
params = context['params']
error_details = ti.xcom_pull(task_ids=task_id_to_check, key='error_details')
if not error_details:
return None # Let DAG fail for unexpected errors
error_code = error_details.get('error_code', '').strip()
policy = params.get('on_bannable_failure', 'retry_with_new_account')
# Connection errors should be retried without banning the account.
connection_errors = ['SOCKS5_CONNECTION_FAILED', 'SOCKET_TIMEOUT', 'TRANSPORT_ERROR', 'CAMOUFOX_TIMEOUT']
if error_code in connection_errors:
logger.info(f"Handling connection error '{error_code}' from '{task_id_to_check}'. Policy: '{policy}'")
if policy == 'stop_loop':
logger.warning(f"Connection error with 'stop_loop' policy. Failing DAG without banning.")
return None
else:
logger.info("Retrying with a new account without banning.")
return 'assign_new_account_for_retry'
is_bannable = error_code in ["BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED"]
logger.info(f"Handling failure from '{task_id_to_check}'. Error code: '{error_code}', Policy: '{policy}'")
if is_bannable and policy in ['retry_with_new_account', 'retry_and_ban_account_only']:
return 'ban_account_and_prepare_for_retry'
if is_bannable and policy in ['retry_on_connection_error', 'retry_without_ban']:
return 'assign_new_account_for_retry'
if is_bannable: # stop_loop
return 'ban_and_fail'
return None # Not a bannable error, let DAG fail
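# Illustrative routing summary for the branch above (derived from the checks it performs):
#   connection error + policy 'stop_loop'                                      -> no branch taken (run fails, no ban)
#   connection error + any other policy                                        -> 'assign_new_account_for_retry'
#   bannable error   + 'retry_with_new_account' / 'retry_and_ban_account_only' -> 'ban_account_and_prepare_for_retry'
#   bannable error   + 'retry_on_connection_error' / 'retry_without_ban'       -> 'assign_new_account_for_retry'
#   bannable error   + any other policy (e.g. 'stop_loop')                     -> 'ban_and_fail'
#   any other error                                                            -> no branch taken (run fails)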

View File

@ -1,707 +0,0 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.
"""
DAG for processing YouTube URLs sequentially from a Redis queue using YTDLP Ops Thrift service.
"""
from airflow import DAG
from airflow.exceptions import AirflowException, AirflowSkipException, AirflowFailException
from airflow.hooks.base import BaseHook
from airflow.models import BaseOperator, Variable
from airflow.models.param import Param
from airflow.operators.bash import BashOperator # Import BashOperator
from airflow.operators.python import PythonOperator
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
from airflow.utils.decorators import apply_defaults
from datetime import datetime, timedelta
from pangramia.yt.common.ttypes import TokenUpdateMode
from pangramia.yt.exceptions.ttypes import PBServiceException
from pangramia.yt.tokens_ops import YTTokenOpService
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport
from thrift.transport.TTransport import TTransportException
import json
import logging
import os
import redis # Import redis exceptions if needed
import socket
import time
import traceback # For logging stack traces in failure handler
# Configure logging
logger = logging.getLogger(__name__)
# Default settings
DEFAULT_QUEUE_NAME = 'video_queue' # Base name for queues
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_TIMEOUT = 30 # Default Thrift timeout in seconds
MAX_RETRIES_REDIS_LOOKUP = 3 # Retries for fetching service details from Redis
RETRY_DELAY_REDIS_LOOKUP = 10 # Delay (seconds) for Redis lookup retries
# --- Helper Functions ---
from utils.redis_utils import _get_redis_client
def _extract_video_id(url):
"""Extracts YouTube video ID from URL."""
if not url or not isinstance(url, str):
logger.debug("URL is empty or not a string, cannot extract video ID.")
return None
try:
video_id = None
if 'youtube.com/watch?v=' in url:
video_id = url.split('v=')[1].split('&')[0]
elif 'youtu.be/' in url:
video_id = url.split('youtu.be/')[1].split('?')[0]
if video_id and len(video_id) >= 11:
video_id = video_id[:11] # Standard ID length
logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
return video_id
else:
logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
return None
except Exception as e:
logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
return None
# --- Queue Management Callables ---
def pop_url_from_queue(**context):
"""Pops a URL from the inbox queue and pushes to XCom."""
params = context['params']
queue_name = params['queue_name']
inbox_queue = f"{queue_name}_inbox"
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
logger.info(f"Attempting to pop URL from inbox queue: {inbox_queue}")
try:
client = _get_redis_client(redis_conn_id)
# LPOP is non-blocking, returns None if empty
url_bytes = client.lpop(inbox_queue) # Returns bytes if decode_responses=False on hook/client
if url_bytes:
url = url_bytes.decode('utf-8') if isinstance(url_bytes, bytes) else url_bytes
logger.info(f"Popped URL: {url}")
context['task_instance'].xcom_push(key='current_url', value=url)
return url # Return URL for logging/potential use
else:
logger.info(f"Inbox queue '{inbox_queue}' is empty. Skipping downstream tasks.")
context['task_instance'].xcom_push(key='current_url', value=None)
# Raise AirflowSkipException to signal downstream tasks to skip
raise AirflowSkipException(f"Inbox queue '{inbox_queue}' is empty.")
except AirflowSkipException:
raise # Re-raise skip exception
except Exception as e:
logger.error(f"Error popping URL from Redis queue '{inbox_queue}': {e}", exc_info=True)
raise AirflowException(f"Failed to pop URL from Redis: {e}")
def move_url_to_progress(**context):
"""Moves the current URL from XCom to the progress hash."""
ti = context['task_instance']
url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url')
# This task should be skipped if pop_url_from_queue raised AirflowSkipException
# Adding check for robustness
if not url:
logger.info("No URL found in XCom (or upstream skipped). Skipping move to progress.")
raise AirflowSkipException("No URL to process.")
params = context['params']
queue_name = params['queue_name']
progress_queue = f"{queue_name}_progress"
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
logger.info(f"Moving URL '{url}' to progress hash: {progress_queue}")
progress_data = {
'status': 'processing',
'start_time': time.time(),
'dag_run_id': context['dag_run'].run_id,
'task_instance_key_str': context['task_instance_key_str']
}
try:
client = _get_redis_client(redis_conn_id)
client.hset(progress_queue, url, json.dumps(progress_data))
logger.info(f"Moved URL '{url}' to progress hash '{progress_queue}'.")
except Exception as e:
logger.error(f"Error moving URL to Redis progress hash '{progress_queue}': {e}", exc_info=True)
# If this fails, the URL is popped but not tracked as processing. Fail the task.
raise AirflowException(f"Failed to move URL to progress hash: {e}")
def handle_success(**context):
"""Moves URL from progress to result hash on success."""
ti = context['task_instance']
url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url')
if not url:
logger.warning("handle_success called but no URL found from pop_url_from_queue XCom. This shouldn't happen on success path.")
return # Or raise error
params = context['params']
queue_name = params['queue_name']
progress_queue = f"{queue_name}_progress"
result_queue = f"{queue_name}_result"
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
# Pull results from get_token task
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command') # Original command
downloaded_file_path = ti.xcom_pull(task_ids='download_video') # Pull from download_video task
logger.info(f"Handling success for URL: {url}")
logger.info(f" Info JSON Path: {info_json_path}")
logger.info(f" SOCKS Proxy: {socks_proxy}")
logger.info(f" YTDLP Command: {ytdlp_command[:100] if ytdlp_command else 'None'}...") # Log truncated command
logger.info(f" Downloaded File Path: {downloaded_file_path}")
result_data = {
'status': 'success',
'end_time': time.time(),
'info_json_path': info_json_path,
'socks_proxy': socks_proxy,
'ytdlp_command': ytdlp_command,
'downloaded_file_path': downloaded_file_path,
'url': url,
'dag_run_id': context['dag_run'].run_id,
'task_instance_key_str': context['task_instance_key_str'] # Record which task instance succeeded
}
try:
client = _get_redis_client(redis_conn_id)
# Remove from progress hash
removed_count = client.hdel(progress_queue, url)
if removed_count > 0:
logger.info(f"Removed URL '{url}' from progress hash '{progress_queue}'.")
else:
logger.warning(f"URL '{url}' not found in progress hash '{progress_queue}' during success handling.")
# Add to result hash
client.hset(result_queue, url, json.dumps(result_data))
logger.info(f"Stored success result for URL '{url}' in result hash '{result_queue}'.")
except Exception as e:
logger.error(f"Error handling success in Redis for URL '{url}': {e}", exc_info=True)
# Even if Redis fails, the task succeeded. Log error but don't fail the task.
# Consider adding retry logic for Redis operations here or marking state differently.
def handle_failure(**context):
"""
Handles failed processing. Depending on the `requeue_on_failure` parameter,
it either moves the URL to the fail hash or re-queues it in the inbox.
If `stop_on_failure` is True, this task will fail, stopping the DAG loop.
"""
ti = context['task_instance']
url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url')
if not url:
logger.error("handle_failure called but no URL found from pop_url_from_queue XCom.")
return
params = context['params']
queue_name = params['queue_name']
progress_queue = f"{queue_name}_progress"
fail_queue = f"{queue_name}_fail"
inbox_queue = f"{queue_name}_inbox"
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
requeue_on_failure = params.get('requeue_on_failure', False)
stop_on_failure = params.get('stop_on_failure', True) # Default to True
exception = context.get('exception')
error_message = str(exception) if exception else "Unknown error"
tb_str = traceback.format_exc() if exception else "No traceback available."
logger.info(f"Handling failure for URL: {url}")
logger.error(f" Failure Reason: {error_message}")
logger.debug(f" Traceback:\n{tb_str}")
try:
client = _get_redis_client(redis_conn_id)
# Always remove from progress hash first
removed_count = client.hdel(progress_queue, url)
if removed_count > 0:
logger.info(f"Removed URL '{url}' from progress hash '{progress_queue}'.")
else:
logger.warning(f"URL '{url}' not found in progress hash '{progress_queue}' during failure handling.")
if requeue_on_failure:
# Re-queue the URL for another attempt
client.rpush(inbox_queue, url)
logger.info(f"Re-queued failed URL '{url}' to inbox '{inbox_queue}' for retry.")
else:
# Move to the permanent fail hash
fail_data = {
'status': 'failed',
'end_time': time.time(),
'error': error_message,
'traceback': tb_str,
'url': url,
'dag_run_id': context['dag_run'].run_id,
'task_instance_key_str': context['task_instance_key_str']
}
client.hset(fail_queue, url, json.dumps(fail_data))
logger.info(f"Stored failure details for URL '{url}' in fail hash '{fail_queue}'.")
except Exception as e:
logger.error(f"Error during failure handling in Redis for URL '{url}': {e}", exc_info=True)
# This is a critical error in the failure handling logic itself.
raise AirflowException(f"Could not handle failure in Redis: {e}")
# After handling Redis, decide whether to fail the task to stop the loop
if stop_on_failure:
logger.error("stop_on_failure is True. Failing this task to stop the DAG loop.")
# Re-raise the original exception to fail the task instance.
# This is better than AirflowFailException because it preserves the original error.
if exception:
raise exception
else:
# If for some reason there's no exception, fail explicitly.
raise AirflowFailException("Failing task as per stop_on_failure=True, but original exception was not found.")
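# --- Illustrative sketch (not part of this DAG): seeding the inbox queue ---
# A minimal, hypothetical helper showing how URLs could be pushed onto the
# '<queue_name>_inbox' list consumed by pop_url_from_queue above. RPUSH is paired
# with the LPOP used there so URLs are processed in FIFO order.
def _seed_inbox_queue(urls, queue_name=DEFAULT_QUEUE_NAME, redis_conn_id=DEFAULT_REDIS_CONN_ID):
    """Push a batch of URLs onto the inbox list and return its new length."""
    client = _get_redis_client(redis_conn_id)
    inbox_queue = f"{queue_name}_inbox"
    if urls:
        client.rpush(inbox_queue, *urls)
    return client.llen(inbox_queue)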
# --- YtdlpOpsOperator ---
class YtdlpOpsOperator(BaseOperator):
"""
Custom Airflow operator to interact with YTDLP Thrift service. Handles direct connections
and Redis-based discovery, retrieves tokens, saves info.json, and manages errors.
Modified to pull URL from XCom for sequential processing.
"""
# Removed 'url' from template_fields as it's pulled from XCom
template_fields = ('service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir', 'redis_conn_id')
@apply_defaults
def __init__(self,
# url parameter removed - will be pulled from XCom
redis_conn_id=DEFAULT_REDIS_CONN_ID,
max_retries_lookup=MAX_RETRIES_REDIS_LOOKUP,
retry_delay_lookup=RETRY_DELAY_REDIS_LOOKUP,
service_ip=None,
service_port=None,
redis_enabled=False, # Default to direct connection now
account_id=None,
# save_info_json removed, always True
info_json_dir=None,
# get_socks_proxy removed, always True
# store_socks_proxy removed, always True
# get_socks_proxy=True, # Removed
# store_socks_proxy=True, # Store proxy in XCom by default # Removed
timeout=DEFAULT_TIMEOUT,
*args, **kwargs):
super().__init__(*args, **kwargs)
logger.info(f"Initializing YtdlpOpsOperator (Processor Version) with parameters: "
f"redis_conn_id={redis_conn_id}, max_retries_lookup={max_retries_lookup}, retry_delay_lookup={retry_delay_lookup}, "
f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, "
f"account_id={account_id}, info_json_dir={info_json_dir}, timeout={timeout}")
# save_info_json, get_socks_proxy, store_socks_proxy removed from log
# Validate parameters based on connection mode
if redis_enabled:
# If using Redis, account_id is essential for lookup
if not account_id:
raise ValueError("account_id is required when redis_enabled=True for service lookup.")
else:
# If direct connection, IP and Port are essential
if not service_ip or not service_port:
raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False.")
# Account ID is still needed for the API call itself, but rely on DAG param or operator config
if not account_id:
logger.warning("No account_id provided for direct connection mode. Ensure it's set in DAG params or operator config.")
# We won't assign 'default' here, let the value passed during instantiation be used.
# self.url is no longer needed here
self.redis_conn_id = redis_conn_id
self.max_retries_lookup = max_retries_lookup
self.retry_delay_lookup = int(retry_delay_lookup.total_seconds() if isinstance(retry_delay_lookup, timedelta) else retry_delay_lookup)
self.service_ip = service_ip
self.service_port = service_port
self.redis_enabled = redis_enabled
self.account_id = account_id
# self.save_info_json removed
self.info_json_dir = info_json_dir # Still needed
# self.get_socks_proxy removed
# self.store_socks_proxy removed
self.timeout = timeout
def execute(self, context):
logger.info("Executing YtdlpOpsOperator (Processor Version)")
transport = None
ti = context['task_instance'] # Get task instance for XCom access
try:
# --- Get URL from XCom ---
url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url')
if not url:
# This should ideally be caught by upstream skip, but handle defensively
logger.info("No URL found in XCom from pop_url_from_queue. Skipping execution.")
raise AirflowSkipException("Upstream task did not provide a URL.")
logger.info(f"Processing URL from XCom: {url}")
# --- End Get URL ---
logger.info("Getting task parameters and rendering templates")
params = context['params'] # DAG run params
# Render template fields using context
# Use render_template_as_native for better type handling if needed, else render_template
redis_conn_id = self.render_template(self.redis_conn_id, context)
service_ip = self.render_template(self.service_ip, context)
service_port_rendered = self.render_template(self.service_port, context)
account_id = self.render_template(self.account_id, context)
timeout_rendered = self.render_template(self.timeout, context)
info_json_dir = self.render_template(self.info_json_dir, context) # Rendered here for _save_info_json
# Determine effective settings (DAG params override operator defaults)
redis_enabled = params.get('redis_enabled', self.redis_enabled)
account_id = params.get('account_id', account_id) # Use DAG param if provided
redis_conn_id = params.get('redis_conn_id', redis_conn_id) # Use DAG param if provided
logger.info(f"Effective settings: redis_enabled={redis_enabled}, account_id='{account_id}', redis_conn_id='{redis_conn_id}'")
host = None
port = None
if redis_enabled:
# Get Redis connection using the helper for consistency
redis_client = _get_redis_client(redis_conn_id)
logger.info(f"Successfully connected to Redis using connection '{redis_conn_id}' for service discovery.")
# Get service details from Redis with retries
service_key = f"ytdlp:{account_id}"
legacy_key = account_id # For backward compatibility
for attempt in range(self.max_retries_lookup):
try:
logger.info(f"Attempt {attempt + 1}/{self.max_retries_lookup}: Fetching service details from Redis for keys: '{service_key}', '{legacy_key}'")
service_details = redis_client.hgetall(service_key)
if not service_details:
logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'")
service_details = redis_client.hgetall(legacy_key)
if not service_details:
raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}")
# Find IP and port (case-insensitive keys)
ip_key = next((k for k in service_details if k.lower() == 'ip'), None)
port_key = next((k for k in service_details if k.lower() == 'port'), None)
if not ip_key: raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}")
if not port_key: raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}")
host = service_details[ip_key] # Assumes decode_responses=True in hook
port_str = service_details[port_key]
try:
port = int(port_str)
except (ValueError, TypeError):
raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}")
logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}")
break # Success
except Exception as e:
logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {str(e)}")
if attempt == self.max_retries_lookup - 1:
logger.error("Max retries reached for fetching Redis details.")
raise AirflowException(f"Failed to get service details from Redis after {self.max_retries_lookup} attempts: {e}")
logger.info(f"Retrying in {self.retry_delay_lookup} seconds...")
time.sleep(self.retry_delay_lookup)
else:
# Direct connection: Use rendered/param values
host = params.get('service_ip', service_ip) # Use DAG param if provided
port_str = params.get('service_port', service_port_rendered) # Use DAG param if provided
logger.info(f"Using direct connection settings: service_ip={host}, service_port={port_str}")
if not host or not port_str:
raise ValueError("Direct connection requires service_ip and service_port (check Operator config and DAG params)")
try:
port = int(port_str)
except (ValueError, TypeError):
raise ValueError(f"Invalid service_port value: {port_str}")
logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)")
# Validate and use timeout
try:
timeout = int(timeout_rendered)
if timeout <= 0: raise ValueError("Timeout must be positive")
logger.info(f"Using timeout: {timeout} seconds")
except (ValueError, TypeError):
logger.warning(f"Invalid timeout value: '{timeout_rendered}'. Using default: {DEFAULT_TIMEOUT}")
timeout = DEFAULT_TIMEOUT
# Create Thrift connection objects
# socket_conn = TSocket.TSocket(host, port) # Original
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET) # Explicitly use AF_INET (IPv4)
socket_conn.setTimeout(timeout * 1000) # Thrift timeout is in milliseconds
transport = TTransport.TFramedTransport(socket_conn) # Use TFramedTransport if server expects it
# transport = TTransport.TBufferedTransport(socket_conn) # Use TBufferedTransport if server expects it
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = YTTokenOpService.Client(protocol)
logger.info(f"Attempting to connect to Thrift server at {host}:{port}...")
try:
transport.open()
logger.info("Successfully connected to Thrift server.")
# Test connection with ping
try:
client.ping()
logger.info("Server ping successful.")
except Exception as e:
logger.error(f"Server ping failed: {e}")
raise AirflowException(f"Server connection test (ping) failed: {e}")
# Get token from service using the URL from XCom
try:
logger.info(f"Requesting token for accountId='{account_id}', url='{url}'")
token_data = client.getOrRefreshToken(
accountId=account_id,
updateType=TokenUpdateMode.AUTO,
url=url # Use the url variable from XCom
)
logger.info("Successfully retrieved token data from service.")
except PBServiceException as e:
# Handle specific service exceptions
error_code = getattr(e, 'errorCode', 'N/A')
error_message = getattr(e, 'message', 'N/A')
error_context = getattr(e, 'context', {})
logger.error(f"PBServiceException occurred: Code={error_code}, Message={error_message}")
if error_context:
logger.error(f" Context: {error_context}") # Log context separately
# Construct a concise error message for AirflowException
error_msg = f"YTDLP service error (Code: {error_code}): {error_message}"
# Add specific error code handling if needed...
logger.error(f"Failing task instance due to PBServiceException: {error_msg}") # Add explicit log before raising
raise AirflowException(error_msg) # Fail task on service error
except TTransportException as e:
logger.error(f"Thrift transport error during getOrRefreshToken: {e}")
logger.error(f"Failing task instance due to TTransportException: {e}") # Add explicit log before raising
raise AirflowException(f"Transport error during API call: {e}")
except Exception as e:
logger.error(f"Unexpected error during getOrRefreshToken: {e}")
logger.error(f"Failing task instance due to unexpected error during API call: {e}") # Add explicit log before raising
raise AirflowException(f"Unexpected error during API call: {e}")
except TTransportException as e:
# Handle connection errors
logger.error(f"Thrift transport error during connection: {str(e)}")
logger.error(f"Failing task instance due to TTransportException during connection: {e}") # Add explicit log before raising
raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}")
# Removed the overly broad except Exception block here, as inner blocks raise AirflowException
# --- Process Token Data ---
logger.debug(f"Token data received. Attributes: {dir(token_data)}")
info_json_path = None # Initialize
# save_info_json is now always True
logger.info("Proceeding to save info.json (save_info_json=True).")
info_json = self._get_info_json(token_data)
if info_json and self._is_valid_json(info_json):
try:
# Pass rendered info_json_dir to helper
info_json_path = self._save_info_json(context, info_json, url, account_id, info_json_dir)
if info_json_path:
ti.xcom_push(key='info_json_path', value=info_json_path)
logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}")
else:
ti.xcom_push(key='info_json_path', value=None)
logger.warning("info.json saving failed (check logs from _save_info_json).")
except Exception as e:
logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True)
ti.xcom_push(key='info_json_path', value=None)
elif info_json:
logger.warning("Retrieved infoJson is not valid JSON. Skipping save.")
ti.xcom_push(key='info_json_path', value=None)
else:
logger.info("No infoJson found in token data. Skipping save.")
ti.xcom_push(key='info_json_path', value=None)
# Extract and potentially store SOCKS proxy
# get_socks_proxy and store_socks_proxy are now always True
socks_proxy = None
logger.info("Attempting to extract SOCKS proxy (get_socks_proxy=True).")
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
if proxy_attr:
socks_proxy = getattr(token_data, proxy_attr)
if socks_proxy:
logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
# Always store if found (store_socks_proxy=True)
ti.xcom_push(key='socks_proxy', value=socks_proxy)
logger.info("Pushed 'socks_proxy' to XCom.")
else:
logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty.")
# Store None if attribute found but empty
ti.xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as extracted value was empty.")
else:
logger.info("No SOCKS proxy attribute found in token data.")
# Store None if attribute not found
ti.xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as attribute was not found.")
# --- Removed old logic block ---
# # Extract and potentially store SOCKS proxy
# socks_proxy = None
# get_socks_proxy = params.get('get_socks_proxy', self.get_socks_proxy)
# store_socks_proxy = params.get('store_socks_proxy', self.store_socks_proxy)
#
# if get_socks_proxy:
# proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
# if proxy_attr:
# socks_proxy = getattr(token_data, proxy_attr)
# if socks_proxy:
# logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
# if store_socks_proxy:
# ti.xcom_push(key='socks_proxy', value=socks_proxy)
# logger.info("Pushed 'socks_proxy' to XCom.")
# else:
# logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty.")
# if store_socks_proxy: ti.xcom_push(key='socks_proxy', value=None)
# else:
# logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found.")
# if store_socks_proxy: ti.xcom_push(key='socks_proxy', value=None)
# else:
# logger.info("get_socks_proxy is False. Skipping proxy extraction.")
# if store_socks_proxy: ti.xcom_push(key='socks_proxy', value=None)
# --- End Removed old logic block ---
# Get the original command from the server, or construct a fallback
ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
if ytdlp_cmd:
logger.info(f"Original command received from server: {ytdlp_cmd[:100]}...") # Log truncated
else:
logger.warning("No 'ytdlpCommand' attribute found in token data. Constructing a fallback for logging.")
# Construct a representative command for logging purposes
if socks_proxy:
ytdlp_cmd = f"yt-dlp --dump-json --proxy \"{socks_proxy}\" \"{url}\""
else:
ytdlp_cmd = f"yt-dlp --dump-json \"{url}\""
logger.info(f"Constructed fallback command: {ytdlp_cmd}")
# Push the command to XCom
ti.xcom_push(key='ytdlp_command', value=ytdlp_cmd)
logger.info("Pushed command to XCom key 'ytdlp_command'.")
# No explicit return needed, success is implicit if no exception raised
except (AirflowSkipException, AirflowFailException) as e:
logger.info(f"Task skipped or failed explicitly: {e}")
raise # Re-raise to let Airflow handle state
except AirflowException as e: # Catch AirflowExceptions raised explicitly
logger.error(f"Operation failed due to AirflowException: {e}", exc_info=True)
raise # Re-raise AirflowExceptions to ensure task failure
except (TTransportException, PBServiceException) as e: # Catch specific Thrift/Service errors not already handled inside inner try
logger.error(f"Unhandled YTDLP Service/Transport error in outer block: {e}", exc_info=True)
logger.error(f"Failing task instance due to unhandled outer Service/Transport error: {e}") # Add explicit log before raising
raise AirflowException(f"Unhandled YTDLP service error: {e}") # Wrap in AirflowException to fail task
except Exception as e: # General catch-all for truly unexpected errors
logger.error(f"Caught unexpected error in YtdlpOpsOperator outer block: {e}", exc_info=True)
logger.error(f"Failing task instance due to unexpected outer error: {e}") # Add explicit log before raising
raise AirflowException(f"Unexpected error caused task failure: {e}") # Wrap to fail task
finally:
if transport and transport.isOpen():
logger.info("Closing Thrift transport.")
transport.close()
# --- Helper Methods ---
def _get_info_json(self, token_data):
"""Safely extracts infoJson from token data."""
return getattr(token_data, 'infoJson', None)
def _is_valid_json(self, json_str):
"""Checks if a string is valid JSON."""
if not json_str or not isinstance(json_str, str): return False
try:
json.loads(json_str)
return True
except json.JSONDecodeError:
return False
def _save_info_json(self, context, info_json, url, account_id, rendered_info_json_dir):
"""Saves info_json to a file. Uses pre-rendered directory path."""
try:
video_id = _extract_video_id(url) # Use standalone helper
save_dir = rendered_info_json_dir or "." # Use rendered path
logger.info(f"Target directory for info.json: {save_dir}")
# Ensure directory exists
try:
os.makedirs(save_dir, exist_ok=True)
logger.info(f"Ensured directory exists: {save_dir}")
except OSError as e:
logger.error(f"Could not create directory {save_dir}: {e}. Cannot save info.json.")
return None
# Construct filename
timestamp = int(time.time())
base_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json"
info_json_path = os.path.join(save_dir, base_filename)
latest_json_path = os.path.join(save_dir, "latest.json") # Path for the latest symlink/copy
# Write to timestamped file
try:
logger.info(f"Writing info.json content (received from service) to {info_json_path}...")
with open(info_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}")
except IOError as e:
logger.error(f"Failed to write info.json to {info_json_path}: {e}")
return None
# Write to latest.json (overwrite) - best effort
try:
with open(latest_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
logger.info(f"Updated latest.json file: {latest_json_path}")
except IOError as e:
logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}")
return info_json_path
except Exception as e:
logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True)
return None
# =============================================================================
# DAG Definition
# =============================================================================
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 1, # Default retries for tasks like queue management
'retry_delay': timedelta(minutes=1),
'start_date': days_ago(1),
# Add concurrency control if needed for sequential processing
# 'concurrency': 1, # Ensure only one task instance runs at a time per DAG run
# 'max_active_runs': 1, # Ensure only one DAG run is active
}
# Define DAG
#
# --- DAG Block Deactivated on 2025-07-16 ---
# This DAG has been replaced by the Sensor/Worker pattern implemented in:
# - ytdlp_sensor_redis_queue.py (polls the queue)
# - ytdlp_worker_per_url.py (processes a single URL)
# This code is kept for reference but is not active.
#

View File

@ -1,974 +0,0 @@
"""
DAG to deploy and manage YTDLP token service.
This DAG handles the deployment, monitoring, and cleanup of a YTDLP token service
for a given account. It supports both Redis-based service discovery and direct
connection via manually specified host and port.
Configuration Options:
- account_id: (Required) The account ID for which the service is being deployed.
- proxy: (Optional) The proxy to use for the service.
- redis_enabled: (Optional, default=True) Whether to use Redis for service discovery.
If False, you must provide `host` and `port` manually.
- host: (Optional) The host IP of the service. Required if `redis_enabled=False`.
- port: (Optional) The port of the service. Required if `redis_enabled=False`.
Usage:
1. Redis-based service discovery:
- Set `redis_enabled=True` (default).
- Ensure Redis is configured in Airflow connections.
- The DAG will automatically discover the service IP and port from Redis.
2. Manual host and port:
- Set `redis_enabled=False`.
- Provide `host` and `port` manually in the DAG configuration.
- Example: {"host": "192.168.1.100", "port": 9090}.
Example Trigger Configuration:
{
"account_id": "test_account",
"proxy": "socks5://proxy.example.com:1080",
"redis_enabled": False,
"host": "192.168.1.100",
"port": 9090
}
"""
from airflow import DAG
from airflow.models.param import Param
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import PythonOperator
# HttpSensor is no longer used
# from airflow.providers.http.sensors.http import HttpSensor
from airflow.utils.trigger_rule import TriggerRule
from airflow.hooks.base import BaseHook
from airflow.exceptions import AirflowException
from typing import Sequence # Add Sequence for type hinting
from datetime import datetime, timedelta
from airflow.utils.dates import days_ago # Add this import
import uuid
import os
import logging
import shutil
import docker
import redis
import requests
import socket
import time
import sys # Import sys for maxsize
from airflow.configuration import conf # Import conf
# Import and apply Thrift exceptions patch
try:
# Always apply the patch, regardless of environment
from thrift_exceptions_patch import patch_thrift_exceptions
patch_thrift_exceptions()
logging.info("Applied Thrift exceptions patch for Airflow compatibility")
# Verify the patch was applied correctly
try:
from pangramia.yt.exceptions.ttypes import PBServiceException
test_exception = PBServiceException(message="Test")
# Try to modify attributes to verify patch works
test_exception.args = ("Test",)
test_exception.message = "Modified test"
logging.info("Verified Thrift exception patch is working correctly")
except Exception as verify_error:
logging.error(f"Thrift exception patch verification failed: {verify_error}")
logging.error("This may cause 'immutable instance' errors during error handling")
except ImportError as e:
logging.warning(f"Could not import thrift_exceptions_patch: {e}")
logging.warning("Airflow compatibility will be affected - expect 'immutable instance' errors")
except Exception as e:
logging.error(f"Error applying Thrift exceptions patch: {e}")
# Default arguments for the DAG
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 0, # Disable retries for all tasks in this DAG
'retry_delay': timedelta(minutes=5),
# Removed 'queue': 'auth_queue' to use the default queue
# Optional: Further filter workers by tags if using CeleryExecutor
'executor_config': {"CeleryExecutor": {"tags": ["auth_node"]}},
}
def get_redis_connection(redis_host=None, redis_port=None):
"""Get a Redis connection using Airflow's Redis connection or manually specified host/port."""
if redis_host and redis_port:
# Use manually specified host and port
return redis.Redis(
host=redis_host,
port=redis_port,
db=0,
decode_responses=True
)
else:
# Use Airflow's Redis connection
redis_conn = BaseHook.get_connection("redis_default")
# Use the password from the connection if available, otherwise use 'airflow' as default
password = redis_conn.password or 'airflow'
return redis.Redis(
host=redis_conn.host, # 'redis' (service name in docker-compose)
port=redis_conn.port, # 6379
password=password,
db=0,
decode_responses=True
)
def get_free_port():
"""Find and return a free port."""
import socket
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('0.0.0.0', 0))
return s.getsockname()[1]
def is_port_free(p):
"""Check if a port is free to use."""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
try:
s.bind(('0.0.0.0', p))
return True
except OSError:
return False
def store_account_metadata(account_id, ip, port, proxy=None, health_port=None, container_id=None):
"""Store account metadata in Redis."""
redis_client = get_redis_connection()
try:
# Verify Redis connection
if not redis_client.ping():
raise ConnectionError("Failed to connect to Redis")
# Store main account metadata
mapping = {
"ip": ip,
"port": str(port),
"status": "running",
"start_time": str(time.time())
}
if proxy:
mapping["proxy"] = proxy
if health_port:
mapping["health_port"] = str(health_port)
if container_id:
mapping["container_id"] = container_id
# Use pipeline for atomic operations
with redis_client.pipeline() as pipe:
# Store main metadata
pipe.hset(f"ytdlp:{account_id}", mapping=mapping)
# Set expiration (1 week)
pipe.expire(f"ytdlp:{account_id}", 604800)
# Add to account list
pipe.sadd("ytdlp:accounts", account_id)
# Execute all commands
results = pipe.execute()
# Verify all commands succeeded
if not all(results):
raise RuntimeError(f"Failed to store metadata for {account_id}. Pipeline results: {results}")
# Verify the data was actually stored
stored_data = redis_client.hgetall(f"ytdlp:{account_id}")
if not stored_data:
raise RuntimeError(f"Failed to verify stored data for {account_id}")
logging.info(f"Successfully stored account metadata for {account_id} in Redis: {stored_data}")
return True
except Exception as e:
logging.error(f"Failed to store account metadata for {account_id}: {e}", exc_info=True)
# Attempt cleanup if storage failed
try:
redis_client = get_redis_connection() # Ensure client is available
redis_client.delete(f"ytdlp:{account_id}")
redis_client.srem("ytdlp:accounts", account_id)
except Exception as cleanup_error:
logging.error(f"Failed to cleanup failed storage for {account_id}: {cleanup_error}")
raise
# Removed get_account_metadata function as the service now handles Redis registration checks.
def prepare_and_deploy_service(**context):
"""Prepare deployment and deploy the Docker service."""
# Retrieve account_id, proxy, clients, and other parameters from DAG run configuration (conf)
# Set default values for account_id, proxy, and redis_enabled
account_id = context['dag_run'].conf.get('account_id') or context['params'].get('account_id', 'account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09')
proxy = context['dag_run'].conf.get('proxy') or context['params'].get('proxy', 'socks5://sslocal-rust-1084:1084')
clients = context['dag_run'].conf.get('clients') or context['params'].get('clients', 'ios,android,mweb')
redis_enabled = context['dag_run'].conf.get('redis_enabled', False) # Default to False
host_param = context['dag_run'].conf.get('host') # Host parameter from config
port_param = context['dag_run'].conf.get('port') # Port parameter from config
docker_network = context['dag_run'].conf.get('docker_network') or context['params'].get('docker_network', 'airflow_prod_proxynet')
host_external_ip_env = os.getenv('HOST_EXTERNAL_IP') # Explicit external IP from environment
if not account_id:
raise ValueError("Account ID is missing.")
# --- Port Determination ---
# Assign a free port if not provided, or validate the provided one
if not port_param:
port = get_free_port()
if not is_port_free(port):
raise ValueError(f"Assigned port {port} is already in use")
logging.info(f"No port provided, assigned free port: {port}")
else:
port = int(port_param)
if not is_port_free(port):
raise ValueError(f"Provided port {port} is already in use")
logging.info(f"Using provided port: {port}")
# Determine health port
health_port = port + 1
if not is_port_free(health_port):
raise ValueError(f"Health port {health_port} (derived from port {port}) is already in use")
logging.info(f"Using health port: {health_port}")
# --- Host Determination ---
# host_for_registration: IP/Host for client discovery (Redis/Logs)
# host_for_sensor: Hostname/IP for Airflow HttpSensor health check
host_for_registration = host_param # Start with the parameter value
if redis_enabled:
# If Redis is enabled, registration host should ideally be externally reachable
if not host_for_registration:
host_for_registration = host_external_ip_env # Use external IP from env var if available
if not host_for_registration:
# If no env var, try fetching external IP using requests
try:
logging.info("HOST_EXTERNAL_IP not set. Attempting to fetch external IP from api.ipify.org...")
response = requests.get('https://api.ipify.org', timeout=10) # 10 second timeout
response.raise_for_status() # Raise exception for bad status codes
host_for_registration = response.text.strip()
if not host_for_registration: # Check if response was empty
raise ValueError("Received empty response from api.ipify.org")
logging.info(f"Successfully fetched external IP: {host_for_registration}")
except requests.exceptions.RequestException as e:
logging.warning(f"Failed to fetch external IP: {e}. Falling back to Docker bridge IP.")
# Fallback to default Docker bridge IP if fetching fails
host_for_registration = "172.17.0.1"
logging.warning(f"Defaulting registration host to Docker bridge IP: {host_for_registration}. Ensure clients can reach this IP.")
except Exception as e:
logging.error(f"Unexpected error fetching external IP: {e}. Falling back to Docker bridge IP.")
host_for_registration = "172.17.0.1"
logging.warning(f"Defaulting registration host to Docker bridge IP: {host_for_registration}. Ensure clients can reach this IP.")
else:
logging.info(f"Redis enabled. Using HOST_EXTERNAL_IP environment variable for registration: {host_for_registration}")
else:
logging.info(f"Redis enabled. Using provided host parameter for registration: {host_for_registration}")
else: # Redis disabled
# If Redis is disabled, registration host defaults to 0.0.0.0 if not provided
if not host_for_registration:
host_for_registration = "0.0.0.0"
logging.warning(f"Redis disabled and no host param provided. Defaulting registration host to {host_for_registration}.")
else:
logging.info(f"Redis disabled. Using provided host parameter for registration: {host_for_registration}")
# host_for_sensor determination will happen *after* container creation, using container name.
logging.info(f"Preparing deployment for account {account_id}. Registration Host: {host_for_registration}, Port: {port}, Health Port: {health_port}")
# Generate unique work ID and context directory
work_id = str(uuid.uuid4())
context['task_instance'].xcom_push(key='work_id', value=work_id)
context_dir = os.path.join(os.getenv('AIRFLOW_HOME', '/tmp'), 'service-data', work_id, 'context-data')
os.makedirs(context_dir, exist_ok=True, mode=0o777)
os.chmod(context_dir, 0o777)
# Push context directory and account details to XCom
context['task_instance'].xcom_push(key='context_dir', value=context_dir)
context['task_instance'].xcom_push(key='account_id', value=account_id)
# Deploy the Docker service
# The 'host_for_registration' variable here represents the externally accessible IP for registration/XCom.
# The service inside the container will listen on 0.0.0.0.
logging.info(f"Deploying service for account {account_id}. Registration Host: {host_for_registration}, Port: {port}")
# Get Redis connection details ONLY if redis_enabled (for the container to register itself)
redis_host_for_container = ''
redis_port_for_container = ''
redis_password_for_container = ''
if redis_enabled:
try:
# Get connection details to pass to the container environment
redis_conn_details = get_redis_connection().connection_pool.connection_kwargs
redis_host_for_container = os.getenv('REDIS_HOST', redis_conn_details.get('host', 'redis'))
redis_port_for_container = str(os.getenv('REDIS_PORT', redis_conn_details.get('port', 6379)))
redis_password_for_container = os.getenv('REDIS_PASSWORD', redis_conn_details.get('password', ''))
logging.info(f"Redis enabled. Passing REDIS_HOST={redis_host_for_container}, REDIS_PORT={redis_port_for_container} to container.")
except Exception as e:
logging.error(f"Failed to get Redis connection details for container environment: {e}")
logging.warning("Proceeding without Redis details in container environment due to error.")
# Depending on container requirements, you might want to raise an error here instead
else:
logging.info("Redis disabled. Not passing REDIS_HOST/REDIS_PORT to container environment.")
# Get Docker connection details from Airflow
try:
secrets_backend = conf.get('secrets', 'backend', fallback='None')
logging.info(f"Attempting to get 'docker_hub' connection. Configured secrets backend: {secrets_backend}")
docker_conn = BaseHook.get_connection("docker_hub")
docker_username = docker_conn.login
docker_password = docker_conn.password
logging.info("Successfully retrieved 'docker_hub' connection.")
except Exception as e:
logging.error(f"Failed to retrieve 'docker_hub' connection: {e}")
# Log details about potential secrets backend issues
secrets_backend_kwargs = conf.get('secrets', 'backend_kwargs', fallback='{}')
logging.error(f"Secrets backend details: backend={secrets_backend}, kwargs={secrets_backend_kwargs}")
# Re-raise the exception to fail the task
raise
try:
# Initialize Docker client to connect to docker-socket-proxy
client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')
# Authenticate with Docker Hub
client.login(
username=docker_username,
password=docker_password,
registry=docker_conn.host # Typically "https://index.docker.io/v1/"
)
# Generate a unique container name
container_name = f"ytdlp_service_{account_id}_{uuid.uuid4().hex[:8]}"
# Pull the Docker image (if not already present)
client.images.pull('pangramia/ytdlp-ops-server:latest')
# Use the configured network name (from params or default)
network_name = docker_network # Use the retrieved parameter
logging.info(f"Attempting to run container on network: {network_name}")
# Determine if --probe flag should be added based on DAG param
exit_on_proxy_fail = context['dag_run'].conf.get('exit_on_proxy_fail', True) # Default to True if not set
command_args = [
'--script-dir', '/app/scripts',
'--context-dir', '/app/context-data', # Use the bind mount target inside container
'--port', str(port),
'--health-port', str(health_port),
'--clients', clients,
'--timeout', '120',
'--proxy', proxy if proxy else '',
'--server-identity', account_id, # Use account_id as server identity
]
if redis_enabled:
command_args.extend(['--redis-host', redis_host_for_container])
command_args.extend(['--redis-port', redis_port_for_container])
if exit_on_proxy_fail:
command_args.append('--probe')
logging.info("Adding --probe flag to container command as exit_on_proxy_fail=True")
else:
logging.info("Not adding --probe flag to container command as exit_on_proxy_fail=False")
# Run the Docker container with health port
container = client.containers.run(
image='pangramia/ytdlp-ops-server:latest',
command=command_args, # Use the constructed command list
environment={
'PYTHONUNBUFFERED': '1', # Ensure logs are not buffered
'SERVER_PORT': str(port), # Port the service listens on *inside* the container
'SERVER_HOST': '0.0.0.0', # Service should listen on all interfaces *inside* the container
'ACCOUNT_ID': account_id,
# Pass Redis details *if enabled* for the service to register itself
'REDIS_HOST': redis_host_for_container,
'REDIS_PORT': redis_port_for_container,
'REDIS_PASSWORD': redis_password_for_container,
# Pass PROXY_URL for health check access
'PROXY_URL': proxy if proxy else '',
},
ports={
f"{port}/tcp": port,
f"{health_port}/tcp": health_port
},
volumes={
context_dir: {'bind': '/app/context-data', 'mode': 'rw'}
},
network_mode=network_name, # Use the specified network variable
auto_remove=False, # Do not auto-remove the container
name=container_name, # Use a unique name
detach=True,
tty=True,
shm_size='256m',
# Updated healthcheck to test external connectivity via proxy
healthcheck={
# Use CMD-SHELL to allow conditional logic based on PROXY_URL env var
'test': [
'CMD-SHELL',
# Script checks if PROXY_URL is set, uses it with curl if yes, otherwise curls directly.
# -f: Fail silently (exit non-zero on error)
# --connect-timeout 10: Timeout for connection phase
# > /dev/null: Discard output, we only care about exit code
'if [ -n "$PROXY_URL" ]; then '
'curl -f --connect-timeout 10 -x "$PROXY_URL" https://ifconfig.co > /dev/null; '
'else '
'curl -f --connect-timeout 10 https://ifconfig.co > /dev/null; '
'fi'
],
'interval': 30 * 1000000000, # Check every 30 seconds (30 * 1e9 nanoseconds)
'timeout': 15 * 1000000000, # Timeout after 15 seconds (15 * 1e9 nanoseconds)
'retries': 5, # Retry 5 times on failure
'start_period': 15 * 1000000000 # Grace period of 15 seconds after start
},
# Add labels for better identification
labels={
'service': 'ytdlp',
'account_id': account_id
}
)
# Wait for container to be running (skip health check verification)
start_time = time.time()
while True:
container.reload()
if container.status == 'running':
break
if time.time() - start_time > 10: # 10 second timeout
raise TimeoutError("Container failed to start within 10 seconds")
time.sleep(1)
logging.info(f"Container started: {container.id} (health check verification skipped)")
# Push container details immediately after creation using simplified keys
context['task_instance'].xcom_push(key='container_id', value=container.id)
context['task_instance'].xcom_push(key='container_name', value=container_name)
logging.info(f"Pushed container_id={container.id} and container_name={container_name} to XCom.")
# --- Determine Host for Sensor ---
# Get the container's IP address on the specified network for the HttpSensor
try:
container.reload() # Refresh container attributes
network_settings = container.attrs.get('NetworkSettings', {}).get('Networks', {})
if network_name in network_settings:
host_for_sensor = network_settings[network_name].get('IPAddress')
if not host_for_sensor:
raise ValueError(f"Container {container.id} has no IPAddress on network '{network_name}'")
logging.info(f"Using container IP '{host_for_sensor}' on network '{network_name}' for HttpSensor.")
else:
# Fallback or error if container not on expected network
logging.error(f"Container {container.id} is not attached to the expected network '{network_name}'. Network settings: {network_settings}")
# Option 1: Fallback to container name (might fail as observed)
# host_for_sensor = container_name
# logging.warning(f"Falling back to container name '{host_for_sensor}' for sensor.")
# Option 2: Raise error
raise ValueError(f"Container {container.id} not found on network '{network_name}'. Cannot determine IP for sensor.")
except Exception as e:
logging.error(f"Failed to get container IP address: {e}", exc_info=True)
raise AirflowException(f"Failed to determine IP address for HttpSensor: {e}")
# Ensure we don't use 0.0.0.0 or empty string for the sensor
if not host_for_sensor or host_for_sensor == "0.0.0.0":
raise ValueError(f"Determined host_for_sensor is invalid ('{host_for_sensor}'). Check container network attachment and IP assignment.")
# --- Add extra logging before pushing ---
logging.info(f"FINAL CHECK before XCom push:")
logging.info(f" Account ID: {account_id}")
logging.info(f" Host for Sensor (IP Address): {host_for_sensor}")
logging.info(f" Host for Registration: {host_for_registration}")
logging.info(f" Service Port: {port}")
logging.info(f" Health Port: {health_port}")
logging.info(f" Pushing to XCom key: service_host with value: {host_for_sensor}")
# --- End extra logging ---
# Push distinct service connection details using simplified keys
context['task_instance'].xcom_push(key='service_host_registration', value=host_for_registration) # For client discovery (e.g., Redis)
context['task_instance'].xcom_push(key='service_host', value=host_for_sensor) # IP Address for HttpSensor
context['task_instance'].xcom_push(key='service_port', value=port) # Port is the same
context['task_instance'].xcom_push(key='service_health_port', value=health_port) # Health port is the same
logging.info(f"Pushed host_for_sensor (IP Address)={host_for_sensor} to XCom key 'service_host'")
logging.info(f"Pushed host_for_registration={host_for_registration} to XCom key 'service_host_registration'")
# Store account metadata in Redis only if redis_enabled is True
# This uses the 'host_for_registration' for client discovery
if redis_enabled:
store_account_metadata(account_id, host_for_registration, port, proxy, health_port, container.id)
# If we reach here, deployment is considered successful for now
logging.info("Deployment preparation successful.")
# Return values are implicitly pushed to XCom (but we pushed explicitly above)
return context_dir, host_for_registration, port
except Exception as e:
logging.error(f"Error during service deployment: {e}", exc_info=True)
# Attempt to cleanup the container if it was created before the error
try:
if 'container' in locals() and container and container.id:
logging.warning(f"Attempting to stop and remove container {container.id} due to deployment error.")
container.stop(timeout=5)
container.remove(force=True)
logging.info(f"Successfully stopped and removed container {container.id} after error.")
elif 'container_name' in locals() and container_name:
# Try finding by name if ID wasn't captured
containers = client.containers.list(filters={'name': container_name})
if containers:
logging.warning(f"Attempting to stop and remove container {containers[0].name} by name due to deployment error.")
containers[0].stop(timeout=5)
containers[0].remove(force=True)
logging.info(f"Successfully stopped and removed container {containers[0].name} after error.")
except Exception as cleanup_err:
logging.error(f"Failed during post-error container cleanup: {cleanup_err}")
raise # Re-raise the original exception to fail the task
# Removed the old monitor_health PythonOperator
# stop_service and cleanup_service are now defined directly in the DAG below.
def check_service_health(ti=None, **context):
"""
Periodically checks the service's /health endpoint using requests.
Acts as a long-running sentinel task. Fails if the health check fails
repeatedly or times out.
"""
# Get parameters from XCom
host_reg = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_host_registration')
host_svc = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_host')
health_port = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_health_port')
# Determine the host to use (prioritize registration host)
host = host_reg if host_reg and host_reg != '0.0.0.0' else host_svc
if not host or not health_port:
raise AirflowException("Could not retrieve host or health_port from XCom for health check.")
health_url = f"http://{host}:{health_port}/health"
logging.info(f"Starting health check for: {health_url}")
# Get configuration for polling
# Use task's execution_timeout if available, otherwise default to 1 year
task_timeout = ti.task.execution_timeout or timedelta(days=365)
poke_interval = 60 # Check every 60 seconds (adjust as needed)
start_time = time.monotonic()
timeout_seconds = task_timeout.total_seconds()
consecutive_error_start_time = None # Track start time of consecutive connection errors
error_retry_window = 10 # Seconds to retry connection errors before failing
while True:
current_time = time.monotonic()
if current_time - start_time > timeout_seconds:
raise AirflowException(f"Health check timed out after {timeout_seconds} seconds for {health_url}")
try:
# Use a reasonable timeout for the individual request
response = requests.get(health_url, timeout=15) # 15 second request timeout
response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
# Check response content if needed (optional)
# Example: Check for specific JSON content
# try:
# data = response.json()
# if data.get("status") == "healthy":
# logging.info(f"Health check successful: Status {response.status_code}")
# else:
# logging.warning(f"Health check OK (Status {response.status_code}), but content unexpected: {data}")
# except requests.exceptions.JSONDecodeError:
# logging.warning(f"Health check OK (Status {response.status_code}), but response is not valid JSON.")
# If we got a 2xx status, log success and reset error timer if needed
if consecutive_error_start_time is not None:
logging.info(f"Connection to {health_url} recovered.")
consecutive_error_start_time = None
logging.info(f"Health check successful: Status {response.status_code} for {health_url}")
except requests.exceptions.Timeout:
current_monotonic_time = time.monotonic()
if consecutive_error_start_time is None:
consecutive_error_start_time = current_monotonic_time
logging.warning(f"Health check request timed out for {health_url}. Starting {error_retry_window}s retry window...")
else:
elapsed_error_time = current_monotonic_time - consecutive_error_start_time
if elapsed_error_time > error_retry_window:
error_msg = f"Health check failed for {health_url}: Timeout persisted for over {error_retry_window} seconds."
logging.error(error_msg)
raise AirflowException(error_msg)
else:
logging.warning(f"Health check request timed out for {health_url}. Retrying within {error_retry_window}s window ({elapsed_error_time:.1f}s elapsed)...")
except requests.exceptions.ConnectionError as e:
# Check if the error is specifically "Connection refused" - fail immediately
if "[Errno 111] Connection refused" in str(e):
logging.error(f"Health check failed for {health_url}: Connection refused. Failing task immediately.")
raise AirflowException(f"Health check failed for {health_url}: Connection refused")
else:
# Handle other connection errors with the retry window
current_monotonic_time = time.monotonic()
if consecutive_error_start_time is None:
consecutive_error_start_time = current_monotonic_time
logging.warning(f"Health check connection error for {health_url}: {e}. Starting {error_retry_window}s retry window...")
else:
elapsed_error_time = current_monotonic_time - consecutive_error_start_time
if elapsed_error_time > error_retry_window:
error_msg = f"Health check failed for {health_url}: Connection error persisted for over {error_retry_window} seconds. Last error: {e}"
logging.error(error_msg)
raise AirflowException(error_msg)
else:
logging.warning(f"Health check connection error for {health_url}: {e}. Retrying within {error_retry_window}s window ({elapsed_error_time:.1f}s elapsed)...")
except requests.exceptions.HTTPError as e:
# This catches 4xx/5xx errors - fail immediately
logging.error(f"Health check failed for {health_url}: Status {e.response.status_code}. Failing task.")
# Fail the task immediately on HTTP error
raise AirflowException(f"Health check failed for {health_url}: Status {e.response.status_code}")
except requests.exceptions.RequestException as e:
logging.error(f"Health check failed for {health_url} with unexpected error: {e}. Failing task.")
# Fail the task immediately on other request errors
raise AirflowException(f"Health check failed for {health_url}: {e}")
except Exception as e:
# Catch any other unexpected errors during the check
logging.error(f"Unexpected error during health check for {health_url}: {e}", exc_info=True)
raise AirflowException(f"Unexpected error during health check: {e}")
# Wait for the poke interval before the next check
time.sleep(poke_interval)
def _wait_forever():
"""Sleeps indefinitely (or until task timeout) to simulate a running service."""
logging.info("Sentinel task started. Sleeping in a loop...")
# Sleep in a loop with a reasonable interval to avoid OverflowError
# The task will keep running until it times out based on execution_timeout
# or is manually stopped/failed.
while True:
try:
# Sleep for a long interval (e.g., 1 day)
# You can adjust this interval if needed.
time.sleep(86400) # Sleep for 24 hours
except KeyboardInterrupt:
logging.info("Sentinel task interrupted. Exiting.")
break
except Exception as e:
# Log other potential errors during sleep, though unlikely
logging.error(f"Error during sentinel sleep loop: {e}")
# Optionally break or continue based on error handling strategy
break # Exit loop on unexpected error
def stop_service(**context):
"""Stop the running Docker container with verification."""
# Retrieve account_id from params or kwargs
account_id = context.get('params', {}).get('account_id') or context.get('account_id')
if not account_id:
raise ValueError("Account ID is missing.")
# Initialize Docker client to connect to docker-socket-proxy
client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')
try:
# For testing, try to get container ID from environment if XCom is not available
container_id = None
if 'ti' in context:
# Use simplified XCom key
container_id = context['ti'].xcom_pull(task_ids='prepare_and_deploy', key='container_id')
if not container_id:
# If not found in XCom, try to find container by account_id pattern (keep this fallback)
containers = client.containers.list(filters={"name": f"ytdlp_service_{account_id}"})
if containers:
container = containers[0]
container_id = container.id
logging.info(f"Found container by name pattern: {container.name} (ID: {container_id})")
else:
logging.warning(f"No container found for account {account_id} - nothing to stop")
return
if container_id:
# If found in XCom, stop by container ID
container = client.containers.get(container_id)
# Verify container is running before stopping
if container.status != 'running':
logging.warning(f"Container {container_id} is not running (status: {container.status})")
return
logging.info(f"Stopping container {container_id}...")
container.stop(timeout=10) # 10 second timeout
# Verify container is stopped
container.reload()
if container.status == 'exited':
logging.info(f"Successfully stopped container {container_id}")
else:
logging.error(f"Container {container_id} failed to stop (status: {container.status})")
raise RuntimeError(f"Container {container_id} failed to stop")
# Clear Redis entries only if redis_enabled is True
# Retrieve redis_enabled status from DAG run conf or params
redis_enabled = context['dag_run'].conf.get('redis_enabled', False) or context['params'].get('redis_enabled', False)
if redis_enabled:
redis_client = get_redis_connection()
try:
# Verify Redis connection
if not redis_client.ping():
raise ConnectionError("Failed to connect to Redis")
# Remove main metadata
redis_client.delete(f"ytdlp:{account_id}")
# Remove from accounts set
redis_client.srem("ytdlp:accounts", account_id)
logging.info(f"Successfully cleared Redis entries for account: {account_id}")
except Exception as e:
logging.error(f"Failed to clear Redis entries for account {account_id}: {e}")
# Do not raise here, allow container stop to be considered successful
# raise # Optional: re-raise if Redis cleanup failure should fail the task
return
logging.warning(f"No container found for account {account_id} - nothing to stop")
except docker.errors.NotFound as e:
logging.warning(f"Container for account {account_id} not found: {e}")
except Exception as e:
logging.error(f"Failed to stop container: {e}")
raise
def cleanup_service(**context):
"""Cleanup service resources including Redis entries and XCom data."""
# Note: This function is now called within the manual_stop_cleanup TaskGroup
try:
# Retrieve account_id from params first, then from XCom
account_id = context['params'].get('account_id')
if not account_id:
# Try to get it from XCom
account_id = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='account_id')
if not account_id:
logging.warning("Account ID not found in params or XCom - skipping resource cleanup")
return
# Redis cleanup (if redis_enabled=True) is handled in the 'stop_service' task.
logging.info(f"Redis cleanup for account {account_id} is handled by the 'stop_service' task if enabled.")
# Cleanup XCom data (using simplified keys where applicable)
# Note: XCom cleanup is generally not strictly necessary but can be good practice.
# Airflow manages XCom expiry. This code doesn't actually *delete* XComs.
# To truly delete, you'd use the Airflow API or DB directly.
# We'll leave the pull calls here as they don't harm anything.
ti = context['task_instance']
ti.xcom_pull(key='container_id', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='container_name', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='service_host_registration', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='service_host', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='service_port', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='service_health_port', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='work_id', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='context_dir', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='account_id', task_ids='prepare_and_deploy', include_prior_dates=True) # Keep account_id pull
logging.info(f"Pulled XCom data for potential cleanup logging for account: {account_id}")
# Initialize Docker client
client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')
container_found_and_removed = False
# Attempt 1: Get container ID from XCom using simplified key
container_id_xcom = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='container_id')
if container_id_xcom:
logging.info(f"Attempting to remove container using XCom ID: {container_id_xcom}")
try:
container = client.containers.get(container_id_xcom)
logging.info(f"Found container {container.id} (Name: {container.name}). Removing...")
container.remove(force=True)
logging.info(f"Successfully removed container {container.id}")
container_found_and_removed = True
except docker.errors.NotFound:
logging.warning(f"Container with XCom ID {container_id_xcom} not found. Trying other methods.")
except Exception as e:
logging.error(f"Error removing container {container_id_xcom}: {e}")
# Attempt 2: Find container by labels if not found/removed via XCom ID
if not container_found_and_removed:
logging.info(f"Attempting to find and remove container by labels: service=ytdlp, account_id={account_id}")
try:
containers = client.containers.list(
filters={'label': [f'service=ytdlp', f'account_id={account_id}']},
all=True # Include stopped containers
)
if containers:
for container in containers:
logging.info(f"Found container {container.id} (Name: {container.name}) by labels. Removing...")
try:
container.remove(force=True)
logging.info(f"Successfully removed container {container.id}")
container_found_and_removed = True # Mark as found even if only one is removed
except Exception as e:
logging.error(f"Error removing container {container.id} found by labels: {e}")
else:
logging.info("No containers found matching labels.")
except Exception as e:
logging.error(f"Error searching for containers by labels: {e}")
# Attempt 3: Find container by name pattern if still not found/removed
if not container_found_and_removed:
container_name_pattern = f"ytdlp_service_{account_id}_*"
logging.info(f"Attempting to find and remove container by name pattern: {container_name_pattern}")
try:
containers = client.containers.list(filters={'name': container_name_pattern}, all=True)
if containers:
for container in containers:
logging.info(f"Found container {container.id} (Name: {container.name}) by name pattern. Removing...")
try:
container.remove(force=True)
logging.info(f"Successfully removed container {container.id}")
container_found_and_removed = True
except Exception as e:
logging.error(f"Error removing container {container.id} found by name: {e}")
else:
logging.info("No containers found matching name pattern.")
except Exception as e:
logging.error(f"Error searching for containers by name: {e}")
if not container_found_and_removed:
logging.warning(f"Could not find or remove any container for account {account_id} using ID, labels, or name.")
# Get context directory from XCom and remove it
context_dir = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='context_dir')
if context_dir and os.path.exists(context_dir):
shutil.rmtree(context_dir)
logging.info(f"Cleaned up working directory: {context_dir}")
except Exception as e:
logging.error(f"Error during cleanup: {e}")
raise
# Define the DAG
with DAG(
'ytdlp_service',
default_args=default_args,
description='Deploy YTDLP token service for ios, android, mweb',
schedule_interval=None,
start_date=days_ago(1), # Use dynamic start date for manually triggered DAG
catchup=False,
tags=['youtube', 'tokens', 'service', 'docker'],
# executor_config moved to default_args
is_paused_upon_creation=False,
params={
'account_id': Param(
'account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09',
type="string",
description="Required: The account ID for which the service is being deployed."
),
'proxy': Param(
'socks5://sslocal-rust-1084:1084',
type=["null", "string"],
description="Optional: The SOCKS5 proxy URL to use for the service (e.g., socks5://host:port)."
),
'clients': Param(
'ios,android,mweb',
type="string",
description="Comma-separated list of client types (e.g., ios,android,mweb)."
),
'redis_enabled': Param(
False,
type="boolean",
description="Use Redis for service discovery? If False, host/port must be provided or will be auto-assigned."
),
'host': Param(
None,
type=["null", "string"],
description="Optional: Host IP for the service. If redis_enabled=False and host is not provided, defaults to '0.0.0.0'. If redis_enabled=True and host is not provided, uses HOST_EXTERNAL_IP or defaults to '0.0.0.0'."
),
'port': Param(
None,
type=["null", "integer"],
description="Optional: Port for the service. If None, a free port will be assigned automatically. If redis_enabled=False and a port is provided, it will be used (after checking availability)."
),
# redis_host and redis_port parameters are removed.
# If redis_enabled=True, the DAG will use the 'redis_default' Airflow connection.
'docker_network': Param(
'airflow_prod_proxynet',
type="string",
description="Optional: The Docker network to attach the container to. Defaults to 'airflow_prod_proxynet'."
),
'exit_on_proxy_fail': Param(
True,
type="boolean",
description="Exit the service container immediately if the initial proxy test fails?"
),
}
) as dag:
# Task to prepare and deploy the service
prepare_and_deploy = PythonOperator(
task_id='prepare_and_deploy',
python_callable=prepare_and_deploy_service,
provide_context=True,
trigger_rule='all_success' # Keep default trigger rule for prepare_and_deploy
)
# Combined Health Check and Sentinel Task using PythonOperator
# This task runs for a long time, checking health periodically using the 'requests' library.
# If the health check fails repeatedly or times out, the task fails, triggering 'stop_service'.
monitor_service_health = PythonOperator(
task_id='monitor_service_health',
python_callable=check_service_health,
provide_context=True,
# Set execution timeout for the task itself (acts as the overall timeout)
execution_timeout=timedelta(days=365), # Long timeout (e.g., 1 year)
# op_kwargs can pass static config, but host/port come from XCom inside the function
# poke_interval and request timeout are handled within check_service_health
)
monitor_service_health.doc_md = """
### Monitor Service Health Task (PythonOperator)
Uses a Python function to periodically check the service's `/health` endpoint using the `requests` library.
Acts as both a health check and a sentinel for the running service.
- **Pulls from XCom:** Reads `service_host_registration`, `service_host`, and `service_health_port` from the `prepare_and_deploy` task to construct the target URL.
- **Polling:** Checks the `/health` endpoint every 60 seconds.
- **Timeout:** Uses the task's `execution_timeout` (set to 1 year) as the overall maximum duration. Individual requests have a 15-second timeout.
- **Failure:** If a health check request returns a 4xx/5xx status code or encounters other request errors, the task fails immediately. If the overall `execution_timeout` is reached without a failure, the task would eventually time out and fail.
"""
# Task to stop the service (runs if monitor_service_health fails)
stop = PythonOperator(
task_id='stop_service',
python_callable=stop_service,
provide_context=True,
trigger_rule=TriggerRule.ONE_FAILED # Run only if monitor_service_health fails
)
stop.doc_md = """
### Stop Service Task
Stops the Docker container associated with the service.
- **Trigger Rule:** `one_failed` - This task only runs if the upstream `monitor_service_health` task fails.
- Pulls container ID/name from XCom or finds it using labels/name patterns.
- Clears Redis entries if `redis_enabled=True`.
"""
# Marker task to indicate that the deployment failed
prepare_failed_marker = EmptyOperator(
task_id='prepare_failed_marker',
trigger_rule=TriggerRule.ONE_FAILED # Run only if 'prepare_and_deploy' fails
)
# Task to cleanup resources (runs after stop sequence OR if prepare fails)
cleanup = PythonOperator(
task_id='cleanup_service',
python_callable=cleanup_service,
provide_context=True,
trigger_rule=TriggerRule.ALL_DONE # Run after upstream (stop or prepare_failed_marker) is done
)
cleanup.doc_md = """
### Cleanup Service Task
Removes the Docker container and cleans up related resources.
- **Trigger Rule:** `all_done` - Runs after the `stop_service` task finishes, whether it succeeded or failed.
- Removes the container using ID from XCom, labels, or name patterns.
- Cleans up XCom variables.
- Removes the context directory.
"""
# Define task dependencies
# Success Path: prepare -> monitor (runs indefinitely)
# Monitor Failure Path: monitor (fails) -> stop -> cleanup
# Prepare Failure Path: prepare (fails) -> prepare_failed_marker -> cleanup
prepare_and_deploy >> monitor_service_health
prepare_and_deploy >> prepare_failed_marker # Trigger marker if prepare fails
monitor_service_health >> stop # Trigger stop if monitor fails
# Cleanup is triggered after stop finishes OR after prepare_failed_marker finishes
stop >> cleanup
prepare_failed_marker >> cleanup

BIN
airflow/dags/.DS_Store vendored

Binary file not shown.

View File

@ -1,88 +0,0 @@
# Architecture and Description of the YTDLP Airflow DAGs
This document describes the architecture and purpose of the DAGs used to download videos from YouTube. The system is built around a continuous, self-sustaining loop for parallel, fault-tolerant processing.
## Main Processing Loop
Processing is performed by two main DAGs that work as a pair: an orchestrator and a worker.
### `ytdlp_ops_orchestrator` (the "ignition" system)
- **Purpose:** This DAG acts as an "ignition system" that kicks off processing. It is triggered manually to start a specified number of parallel worker loops.
- **How it works:**
  - It does **not** process URLs itself.
  - Its only job is to launch the configured number of `ytdlp_ops_worker_per_url` DAG runs.
  - It passes all required configuration (account pool, Redis connection, etc.) to the workers.
### `ytdlp_ops_worker_per_url` (self-sustaining worker)
- **Purpose:** This DAG processes a single URL and is designed to run in a continuous loop.
- **How it works:**
  1. **Start:** The initial run is triggered by `ytdlp_ops_orchestrator`.
  2. **Fetching a task:** The worker pops one URL from the `_inbox` queue in Redis. If the queue is empty, the worker finishes and its processing "lane" stops.
  3. **Processing:** It talks to the `ytdlp-ops-server` service to obtain `info.json` and a proxy, then downloads the video.
  4. **Continue or stop:**
     - **On success:** it triggers a new instance of itself, creating a continuous loop that processes the next URL.
     - **On failure:** the loop is broken (if `stop_on_failure` is set to `True`), stopping that processing "lane". This prevents a single problematic URL or account from stalling the whole system.
## Management DAGs
### `ytdlp_mgmt_proxy_account`
- **Purpose:** The main tool for monitoring and managing the state of the resources used by `ytdlp-ops-server`.
- **Functionality:**
  - **Status overview:** shows the current status of every proxy and account (e.g., `ACTIVE`, `BANNED`, `RESTING`).
  - **Proxy management:** allows manually banning, unbanning, or resetting the status of a proxy.
  - **Account management:** allows manually banning or unbanning accounts.
### `ytdlp_mgmt_queues`
- **Purpose:** Provides a set of tools for managing the Redis queues used by the processing pipeline.
- **Functionality (via the `action` parameter):**
  - `add_videos`: add one or more YouTube URLs to the queue (see the sketch below).
  - `clear_queue`: clear (delete) the specified Redis key.
  - `list_contents`: inspect the contents of a Redis key (a list or a hash).
  - `check_status`: check the overall state of the queues (type, size).
  - `requeue_failed`: move every URL from the `_fail` failure queue back to the `_inbox` queue for reprocessing.
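Under the hood, `add_videos` presumably amounts to pushing URLs onto the inbox list. A minimal sketch of the same operation with `redis-py`, assuming the inbox is a plain Redis list; the connection details and exact key name below are assumptions (only the `_inbox` suffix is documented):

```python
import redis

# Connection details and key name are placeholders; adjust to the actual deployment.
r = redis.Redis(host="redis", port=6379, decode_responses=True)
inbox_key = "ytdlp_urls_inbox"  # hypothetical name; only the "_inbox" suffix is documented

urls = ["https://www.youtube.com/watch?v=sOlTX9uxUtM"]
r.rpush(inbox_key, *urls)                  # enqueue for the worker "lanes" to pop
print("inbox length:", r.llen(inbox_key))
```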
## Resource Management Strategy (Proxies and Accounts)
The system uses an intelligent strategy for managing the lifecycle and state of accounts and proxies in order to maximize the success rate and minimize bans.
- **Account lifecycle ("cooldown"):**
  - To prevent "burnout", accounts automatically enter a resting state (`RESTING`) after a period of heavy use.
  - Once the rest period expires, they automatically return to `ACTIVE` and become available to workers again.
- **Smart ban strategy:**
  - **Ban the account first:** when a serious error occurs (e.g., `BOT_DETECTED`), the system penalizes **only the account** that caused the failure; the proxy keeps working.
  - **Sliding-window proxy bans:** a proxy is banned automatically only if it shows **systematic failures across DIFFERENT accounts** within a short time window, which is a reliable indicator that the proxy itself is the problem (see the sketch after this section).
- **Monitoring:**
  - The `ytdlp_mgmt_proxy_account` DAG is the primary monitoring tool. It shows the current status of every resource, including the time remaining until banned or resting accounts become active again.
  - The execution graph of `ytdlp_ops_worker_per_url` now explicitly shows steps such as `assign_account`, `get_token`, `ban_account`, and `retry_get_token`, which makes debugging more transparent.
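A minimal sketch of how such a sliding-window check could be kept in Redis. The real logic lives inside `ytdlp-ops-server`; the key layout, window length, and threshold below are assumptions for illustration only:

```python
import time
import redis

WINDOW_SECONDS = 600      # assumed sliding-window length
DISTINCT_ACCOUNTS = 3     # assumed number of distinct failing accounts that triggers a ban

def record_failure_and_check_ban(r: redis.Redis, proxy: str, account_id: str) -> bool:
    """Record a (proxy, account) failure and report whether the proxy should be banned."""
    key = f"proxy_failures:{proxy}"   # hypothetical key layout
    now = time.time()
    r.zadd(key, {account_id: now})                     # one entry per failing account, scored by last failure time
    r.zremrangebyscore(key, 0, now - WINDOW_SECONDS)   # drop failures that fell out of the window
    r.expire(key, WINDOW_SECONDS)
    # Ban only when several *different* accounts have failed through this proxy recently.
    return r.zcard(key) >= DISTINCT_ACCOUNTS
```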
## External Services
### `ytdlp-ops-server` (Thrift service)
- **Purpose:** An external service that provides authentication data (tokens, cookies, proxies) for downloading videos.
- **Interaction:** The worker DAG (`ytdlp_ops_worker_per_url`) calls this service before starting a download to obtain the data `yt-dlp` needs.
## Worker DAG Logic (`ytdlp_ops_worker_per_url`)
This DAG is the workhorse of the system. It is designed as a self-sustaining loop that processes one URL per run.
### Tasks and their purpose:
- **`pull_url_from_redis`**: pops one URL from the `_inbox` queue in Redis. If the queue is empty, the DAG ends with a `skipped` status, stopping this processing "lane".
- **`assign_account`**: selects an account for the job. It reuses the account that succeeded in the previous run of its "lane" (account affinity); on the first run it picks a random account.
- **`get_token`**: the core task. It calls `ytdlp-ops-server` to obtain `info.json`.
- **`handle_bannable_error_branch`**: if `get_token` fails with a bannable error, this branching task decides what to do next based on the `on_bannable_failure` policy.
- **`ban_account_and_prepare_for_retry`**: if the policy allows a retry, this task bans the failed account and selects a new one for the retry.
- **`retry_get_token`**: makes a second attempt to obtain a token with the new account.
- **`ban_second_account_and_proxy`**: if the second attempt also fails, this task bans the second account and the proxy that was used.
- **`download_and_probe`**: if `get_token` (or `retry_get_token`) succeeded, this task uses `yt-dlp` to download the media and `ffmpeg` to verify the integrity of the downloaded file.
- **`mark_url_as_success`**: if `download_and_probe` succeeded, this task records the result in the `_result` hash in Redis.
- **`handle_generic_failure`**: if any of the main tasks fails with an unrecoverable error, this task writes detailed error information to the `_fail` hash in Redis.
- **`decide_what_to_do_next`**: a branching task that runs after success or failure and decides whether to continue the loop.
- **`trigger_self_run`**: the task that actually triggers the next DAG run, creating the continuous loop (a minimal sketch follows below).
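The self-triggering step can be expressed with Airflow's standard `TriggerDagRunOperator`. A minimal sketch, assuming the worker DAG id is `ytdlp_ops_worker_per_url`; the surrounding DAG shell here is only a placeholder:

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.trigger_dagrun import TriggerDagRunOperator

with DAG(
    dag_id="ytdlp_worker_loop_sketch",   # placeholder id for this sketch only
    start_date=datetime(2025, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    # Re-trigger the worker DAG so the loop continues with the next URL from the inbox.
    trigger_self_run = TriggerDagRunOperator(
        task_id="trigger_self_run",
        trigger_dag_id="ytdlp_ops_worker_per_url",
        wait_for_completion=False,  # fire-and-forget so the current run can finish
        # The real task presumably forwards dag_run.conf to the next run; omitted here.
    )
```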

View File

@ -1,23 +0,0 @@
import socket
import logging
logger = logging.getLogger(__name__)
def get_ip_address():
"""
Get the primary IP address of the host.
This is used by Airflow workers to advertise their IP for log serving,
ensuring the webserver can reach them in a multi-host environment.
"""
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
# This doesn't even have to be reachable
s.connect(('10.255.255.255', 1))
ip_address = s.getsockname()[0]
logger.info(f"Determined host IP address as: {ip_address}")
except Exception as e:
logger.warning(f"Could not determine IP address, falling back to 127.0.0.1. Error: {e}")
ip_address = '127.0.0.1'
finally:
s.close()
return ip_address

View File

@ -1,56 +0,0 @@
from airflow.plugins_manager import AirflowPlugin
from airflow.hooks.base import BaseHook
from airflow.configuration import conf
import uuid
import backoff
class YTDLPHook(BaseHook):
def __init__(self, conn_id='ytdlp_default'):
super().__init__()
self.conn_id = conn_id
self.connection = self.get_connection(conn_id)
self.timeout = conf.getint('ytdlp', 'timeout', fallback=120)
self.max_retries = conf.getint('ytdlp', 'max_retries', fallback=3)
@backoff.on_exception(backoff.expo,
Exception,
max_tries=3,
max_time=300)
def start_service(self, host, port, service_id, work_dir):
"""Start token service as a long-running process"""
import subprocess
import os
from pathlib import Path
# Get script path relative to Airflow home
airflow_home = os.getenv('AIRFLOW_HOME', '')
script_path = Path(airflow_home).parent / 'ytdlp_ops_server.py'
# Ensure work directory exists
os.makedirs(work_dir, exist_ok=True)
# Start service process
cmd = [
'python', str(script_path),
'--port', str(port),
'--host', host,
'--service-id', service_id,
'--context-dir', work_dir,
'--script-dir', str(Path(airflow_home) / 'dags' / 'scripts')
]
self.log.info(f"Starting token service: {' '.join(cmd)}")
# Bring the service up via docker-compose (detached); the 'cmd' list above is only logged for reference.
docker_cmd = [
'docker-compose', '-f', 'docker-compose.yaml',
'up', '-d', '--build', 'ytdlp-service'
]
subprocess.run(docker_cmd, check=True)
self.log.info(f"Token service started on {host}:{port}")
return True
class YTDLPPlugin(AirflowPlugin):
name = 'ytdlp_plugin'
hooks = [YTDLPHook]

View File

@ -1,14 +0,0 @@
2025-04-06 00:41:03,141 - INFO - Attempting to connect to server at 127.0.0.1:9090...
2025-04-06 00:41:03,141 - INFO - Successfully connected to server
2025-04-06 00:41:03,142 - INFO - Server connection test successful
2025-04-06 00:41:03,142 - INFO - Requesting token for URL: https://www.youtube.com/watch?v=sOlTX9uxUtM%27
2025-04-06 00:41:17,930 - INFO - Successfully received token data from server
2025-04-06 00:41:17,938 - INFO - Valid JSON with video data: Операция "Багратион". От поражения к победе.
2025-04-06 00:41:17,944 - INFO - Successfully saved info.json to info_json_sOlTX9uxUtM_1743889277.json and latest.json to latest.json
2025-04-06 00:44:05,608 - INFO - Attempting to connect to server at 127.0.0.1:9090...
2025-04-06 00:44:05,609 - INFO - Successfully connected to server
2025-04-06 00:44:05,609 - INFO - Server connection test successful
2025-04-06 00:44:05,610 - INFO - Requesting token for URL: https://www.youtube.com/watch?v=sOlTX9uxUtM%27
2025-04-06 00:44:18,350 - INFO - Successfully received token data from server
2025-04-06 00:44:18,357 - INFO - Valid JSON with video data: Операция "Багратион". От поражения к победе.
2025-04-06 00:44:18,364 - INFO - Successfully saved info.json to info_json_sOlTX9uxUtM_1743889458.json and latest.json to latest.json