From 7b13257bd1ca5ce49c5f9a30b384a787d6aac8b1 Mon Sep 17 00:00:00 2001 From: aperez Date: Tue, 26 Aug 2025 19:25:31 +0300 Subject: [PATCH] Remove unused dags --- .aider.chat.history.md | 36 + .aider.input.history | 15 + .aider.tags.cache.v4/cache.db | Bin 0 -> 81920 bytes airflow/.env | 23 - airflow/.env.example | 60 - airflow/dags-backup/old-download-dag.py | 1167 ----------------- airflow/dags-backup/proxy-checker.py | 736 ----------- airflow/dags-backup/ytdlp_client_dag_v2.1.py | 941 ------------- .../ytdlp_mgmt_queue_check_status.py | 179 --- .../dags-backup/ytdlp_ops_worker_per_url.py | 343 ----- .../ytdlp_proc_sequential_processor.py | 707 ---------- airflow/dags-backup/ytdlp_service_dag.py | 974 -------------- airflow/dags/.DS_Store | Bin 6148 -> 0 bytes airflow/dags/README.ru.md | 88 -- .../ytdlp_service_deploy.cpython-312.pyc | Bin 3339 -> 0 bytes .../ytdlp_service_test.cpython-312.pyc | Bin 2269 -> 0 bytes .../ytdlp_token_dag.cpython-312.pyc | Bin 5700 -> 0 bytes airflow/dags/get_ip.py | 23 - airflow/plugins/ytdlp_plugin.py | 56 - airflow/ytdlp_ops_client.log | 14 - 20 files changed, 51 insertions(+), 5311 deletions(-) create mode 100644 .aider.chat.history.md create mode 100644 .aider.input.history create mode 100644 .aider.tags.cache.v4/cache.db delete mode 100644 airflow/.env delete mode 100644 airflow/.env.example delete mode 100644 airflow/dags-backup/old-download-dag.py delete mode 100644 airflow/dags-backup/proxy-checker.py delete mode 100644 airflow/dags-backup/ytdlp_client_dag_v2.1.py delete mode 100644 airflow/dags-backup/ytdlp_mgmt_queue_check_status.py delete mode 100644 airflow/dags-backup/ytdlp_ops_worker_per_url.py delete mode 100644 airflow/dags-backup/ytdlp_proc_sequential_processor.py delete mode 100644 airflow/dags-backup/ytdlp_service_dag.py delete mode 100644 airflow/dags/.DS_Store delete mode 100644 airflow/dags/README.ru.md delete mode 100644 airflow/dags/__pycache__/ytdlp_service_deploy.cpython-312.pyc delete mode 100644 airflow/dags/__pycache__/ytdlp_service_test.cpython-312.pyc delete mode 100644 airflow/dags/__pycache__/ytdlp_token_dag.cpython-312.pyc delete mode 100644 airflow/dags/get_ip.py delete mode 100644 airflow/plugins/ytdlp_plugin.py delete mode 100644 airflow/ytdlp_ops_client.log diff --git a/.aider.chat.history.md b/.aider.chat.history.md new file mode 100644 index 0000000..716fe4b --- /dev/null +++ b/.aider.chat.history.md @@ -0,0 +1,36 @@ + +# aider chat started at 2025-08-19 17:52:04 + +> Newer aider version v0.86.1 is available. +> /Users/aperez/.local/share/uv/tools/aider-chat/bin/python3 -m pip install --upgrade --upgrade-strategy only-if-needed aider-chat +> Run pip install? (Y)es/(N)o [Yes]: y +> Re-run aider to use new version. + +# aider chat started at 2025-08-19 17:55:26 + +> /Users/aperez/.local/bin/aider --model gemini/gemini-2.5-pro-preview-03-25 +> Aider v0.86.1 +> Main model: gemini/gemini-2.5-pro-preview-03-25 with diff-fenced edit format +> Weak model: gemini/gemini-2.0-flash +> Git repo: .git with 34 files +> Repo-map: using 4096 tokens, auto refresh +> https://aider.chat/HISTORY.html#release-notes +> Please answer with one of: yes, no, skip, all +> Would you like to see what's new in this version? (Y)es/(N)o [Yes]: n + +#### ls +> +> +> ^C again to exit +> +> +> ^C again to exit + +#### /read-only ../yt-dlp-deployment/ansible/airflow_worker +> Added 51 files from directory /opt/yt-dlp-dags/../yt-dlp-deployment/ansible/airflow_worker to read-only files. 
+> +> +> ^C again to exit +> +> +> ^C KeyboardInterrupt diff --git a/.aider.input.history b/.aider.input.history new file mode 100644 index 0000000..bd19aec --- /dev/null +++ b/.aider.input.history @@ -0,0 +1,15 @@ + +# 2025-08-19 17:52:06.094899 ++Y + +# 2025-08-19 17:55:33.616531 ++D + +# 2025-08-19 17:55:35.382770 ++No + +# 2025-08-19 17:55:39.050939 ++ls + +# 2025-08-19 17:56:18.910148 ++/read-only ../yt-dlp-deployment/ansible/airflow_worker diff --git a/.aider.tags.cache.v4/cache.db b/.aider.tags.cache.v4/cache.db new file mode 100644 index 0000000000000000000000000000000000000000..9af2e3c8760312443183a970452c75e3e1f6a1d4 GIT binary patch literal 81920 zcmeHw37A|(wSV_?&pzFEvQGAN!j?d0LP$i^hcZbfVZ6DSFqr@mu<7Z!Gt)_L^peSd zECIm?S5zDX1r%A_o`A9o!UI78l|_6m;D(~0h%1QegZHbubXVOzb$fWe_y2tVcjtrd z^r`LCsZ&*_PMvdV`*B-RB|SEpD`XO-*xEp6prIjfOe_`%1j6`#HvV`1+VBqsV}C99 zKWP5T`y(9axnN~P`E#JX^{#+&n(|O|EIKQ4S7fB)hW1z5-_lkF*#Gk>;8Vb-fKLIR z0zL(N3iuS576tYl)!w}D4P7ngt}Yc)V`F+@yIv}#vSY<~F?G5gAJNl#N#Br2j_Vr+ z`+A4^Vnc)dn>Y6j#;AO;-c3VD+Ss?H4>|4TO#_3mzTOST#s&vYh^_D2+`l!p zW7|ennNvw@d*4uOPa<8`V@KQH-LZi!8+#ZMu@jE%8|;hi)+dp;oI#eqCDylfgD`MVmhc>`E1l}y#6uN4~uQ>JHbMI8ofBG)oTD!*`i)3)gJ&WpWTc8t=szs zhyK9<;Cdk&fORd+3*QLdImkT#$y_;Gnhx{u8UK5^XN|}GtpoUf$CfQIi~M%gD_1u(b!}K2447k9JS~kgeLPVv z<;>r4r=s{8=U?m`&(84HA>KU6Qy-1U(P(~z@UJ#js-68SDkjXRWSPyf%rt-EAO4?D z0iOas1$+wl6!0nFQ^2QyPXV6-J_URV_!RId@J~U3Nc6{n&gdNh<@d_7%3aE*l*^QJ zlyOB<)+vjXQ1qA4JEGr-J{tY2h3~B8Q&2P!1ibcKHtc2#YjHgP)<|zKb?(9LAwpKIWypGTtf+MNo?sz_x z+@03r`9iLgOXkvz^V&meLS{-Pm5sBTd((m{a86)OTWC$IeVbmK%qGVRxooan>}m~l zhiz~slgOLd*~Rz#meAT+cEYY)xsXky| zs?1QHi2gD9LiAPTe&q_~d?lltsPrm-P=1DN{+~|)p8`Gwd32fws{6prt10YzfT?GBtt1{=rVw(N;&D5Sb@Y6LicCt+Dh6f|j72fooL+ zZL>q&4!EKsXzdEEb%4Bjpk+>IwF9r}2Aa8=In6**RL~4GvY%DOKzLT@u%JsR5Q>DB z1}o?PhDc~xYxVqZn(8ahv+7adZDMF=W|f%Oc|k}#Msn_kPxfa=#;VMbt5{Zwir$69CO@5QzQE5RB3A8)Ogd1 z@#gUsq_vLer7cc!!IeDE2qAfDJj$1e8R1lRH0Oe@GMXn6g)AHcT?w7W!cqe2F!(~1 z64|64&y<-~2)d;4TyFQ&cpIx*FZb*VN27s#p~|3~THDn`CRZNK?Oi>YE0@Z{@H`nY zoloM$-Kl(DFLWQ?b7arqJ#bDkN9Ey%j+=q$CFOBkJ}{^O^!*6*LHj83_y7>NXu z(?J5gDU{+#8>gP)xYgKQw>#g`MxTydzx2}gJT@_qDcz?vd%VK9 z>5r)5a_Pq`TrT~XH!gY)hsy_!X^RH*3MwDpBj*8!@R*c^J(y1RlzjNChpl$cdYHBQ zLbEmM;-J5J^a-~-y{6Kx$CO;?G_%_IN432vW7X?HZ%S=5lzgI4G*!ZTC*$QpnwXJi z8#9tnuVz8nlAg)OM^c5NJxQENyW9w85~-{UdZ5vf7{P=njFmG`%c5%cY~IXI_VL;L zoIc-Z%okE9C4%2agFj$2r;4Tq+m-)vqxl`F{3fWjT?tpv6!p~i875~bw03w*FA+C84B8;sUX zX8fd54K?0wL@Z{bS@%MtB{5vgrOPEZH-5xu)AyDNiDW5}cC+PZr zdG@s1Gxc!K)B&S$L?13g3=tS@ha=i%IJ>h;>e4U&qj)ow?;ZTe%5hQ z$LSq|9S61lvHjlmi`!3XKe+9;Z4b9y(l*j|KhER7dZqougMH>u~@EkZG#XKEHy5bF+~QolUT+T}cQFMUHeQ$Ke>Kdp>VXH215 z>?!RnEjc(mK$LQ4TYO|VM`SZ^mMn6yu219&P@KncbHQ}Mz-iIC@b3D>as`<6LW7k+ zY&R5Ictm)tej(T$n{+ZY?-E6vpoypVm6gS>_S(eSu{p=OL}%sZn9@Gkx9>76V?gL?R?h3EGT%}yDd`P)i zd9U(r_cH6QT7pGA06zYoqe>ik5=~4!akbWM-%&KWFKMn5n>+=p-`aJJpV^G2b7nU z=ar|GM{(Z2Mfs%i5yimyzM$x^2T+x_Dr=Mjl`f?%`XABXMW01Pz(=BAiry5xCVFXf zfAoxKHkybIp%VYkr+`lZp8`Gwd9iqKfdv~ z+vZn(eEq?f5$n%J=WCz;d{_0yG4tkDemwHj=$z`0Z#*}<`s0D{tm=;&T4q*%T-!9G z`Xh68XZ6R%JCw=~I5h+st^yHorUD0l*#57Mel4m;Ly?b1 zj`sckegA*o|G(Vh`~S1h808+{|6ld}|9$^|w=cQx|L^<%BUq2`|8E5?^8NoUI{5zo zs_*~Lf|c3kgTDQ3|_h+z8kitdMY{owkFeB}|rG*+)sDyVSXGU{B9pu*k8 zoQ-{(dUtFYiVyZ}?B5>WFtBxNynmxBVCgDjsgq6X-P*gkPwU${6yM%Acsv4u#kUO% z4!H^(W-Rt7(7(-5TJybIhpLri??UkJ^;l z=`pf%jMhQsvVSa#++0i5c}|SQC`VLvq0y*YK}`klBBQg{imu(~#By*&j$T3+Tw1qa zrx55?Xtm1-=WAnB&ox>w?56*}3x}D;j5tfO0tHyXDr`POu)*BiNZ_Oj3)CJsxiR5F z7g)o7%sl72HjG*|cKC%mPrR-&{NNUuRKQP4l%W5w*?7ln<^QX#IeL8ECzPwOm9t&p&+fy$4irOS+J60eC0BbHacCq`Em z>Rub9H-p~>Jqo=eXtd;0c|D!Vx^PwRCaY85z^|QR?~amS@OKG325O`cu~qD(cC{q? 
zrnUQ?K~;G<)w+gT50Z!%P(-|u-XGR^F}Wb~93!Wcknd#UpsIwc+xK|Icxo(*I189S zE>i;B|1pj=uuFg*tcpzc1S{;!I@Bm z8N1U|)7Ft~e4JFF3w6T(kn^YDlhkijwn-kW9Z5EI64=-lEco0opOXwHEQm3b|Nqk0R`9rbo+#tW?l`I{agWqJMaf?1orh6yFBcb5XVjC;UBIUAqEPMU^r zDh83pK5R1Lqvpu>*AjdcQ85R9gs zCV|5#b{;dqN@Za;g(!nb7KbuB7EkAxRfB-WR~&`60EIuBZDcBdpQ*&km@5LR!8_Gck*-S&a3CvExEsLj;p9qKsZx^g>X6| z)Dl7$VPH@4?$0FfU(j&%6C_rMmgdYX`KMR5iDNQ5ADY^DYOi^c-#i4X|%@ItZp_shc+SJsXj6U?NBjq zda4UKCaOZe0Efj=ay*x+nd)DXP4#>0^1?E##t3vIkuh+sE#J4Vl8n$#B{W*lV9XrJ zO=Q^>uPNRT>wwyCG;K(yV5jV2phMblbMQ4ZtHL&loX_x^B;mi5z?s87v8OTbS=%*C=+ak> zM$@at)%_n-)>h_m&6i*y5UW;FLdyL?$vV%U2DOjEXhycV@)_}f=}gnw-W5h$$n#l2 z4i}Ox@fO2{yjyYbLO{7f0tz_W#To_-LY8NZMK;!$X<6pyA2%h(_^|1u>&pEX*M#J& zVw%ZC_NgGNPdKyIGzl!|B^K9H$i5U&;LO>L!vwR{cO@Sbq-Ug1$6H)$fRK1?&`YWn zvZhEOfX|}tBoDEC7R>;oWDHql@ z@F`G-0vDVaQv&;f6<-fyu;wD!v?cGxMR#|iTpC|p^Atwhd?LfS_*D*#%fIq1c&C~|{V7u(jvp(h-TIXA=KnhCQ)-*h0l{P!yHF|QawWnsL@^xe zd119h$?&!W;(OHI<968M!wg;{*@1vaS-dw6QO5|THhlo)7YDcCPphs4YRRZ`;9&e!t8QYAO#Bec}Hlx)FDT-imDkR}8 zCj0V1057M(D@nxH(1@?)qz|F;b+UvR9$n?PC3P7b0I?tA%?Q}$J5OE+ITjnUSPo=h z1L4YqeJj5-5;&zY`)Z%*aj`!dDd#IwZET}8)y6imskYRf>a|ntq01^$t=zMBNhFY& zev8cKFICVp%!jb2w6|oo_}I@oop0}!cfOTtIkh+v81QJBZtBjPt)}k0*{i85CLSH* zn!0IpQIvZu??*?~EDbJ$8p~BEj`%uE#BeD5*#5_?#`izQ8o!7=%!WUa!YdD0l0CAs z(zwTB#=n)v42NnOt@BnMT}JJxEKIfDY)4Gbre@viFoIb#4dbA-hT<5?4E1Y$13QL9 z85COE*JDZZW~0NYu6J;=+tBA~N`9_Cq(tU@-%wx>R*Fr06NNpf2cIzpkPA7FN_tB?ZgqM-&KvicK;L(Edd98X( zb*#hroXjYj9LTQ$lUnU3Jf0}e0=`B>-J%b=?0%+J$@)e5xGL0mA9>E?vuFVv=cr?^ zkO>NQ>H^?{jOd8->pHPt>f9@e&k_nXf|p4>$R@bt>7^l5nk&22Ik&o;Ooa-~d$w8) zwRScpVnMgw%j63nOe{fE$2LRAc_BHVM1_6XTPVNf5%b64X-wO5rY%n!3rdx-3! zKIx&*osH3RqIe1DK1BEKhY~tIPs3GQYhXw2#Q(EwY5QPK+SabeQdoU+Oh|%$yLve+ zq~NBT_XHBs9)!<}{musuSO3kXC=X;&Ev#|nq3Q~{23T#FYXqN{y~0#|IY>zP9uzdZ z2d0pi0G8?81dA3Z!21EW%ZSSBvpyEMb7_qpy6-$nzq83{H33^aF!>%Fc=m8!_ zK%vrmXw)Yx$XUP4H^%(3Q;w~OUb2v^fM|L=iRA)vG$C=sKJNa>d5L&%(v zu*iiHf;wf#8g@wVgmRAvOAB0g0?M?ko)54Y%(9O7*O$L)$dbO{)<#bVG&5FMF>Y0VisxG zVu<;u^?6U1Z;{J7L>6LIIg&;Q52IBNZbFBDLC>Op!Sc)GC3yPg69^2-9IW|r+Iz_% z`6|g_>IIV4S#W1cBo|u3ibP5tE9kCjpFv!JVq%XD{{eF6sQAJNt>8+LTuvSUoEl+v zsZ59#+6)>ig!h*WzJ|^ zCYFQRwL|@_d`V>nD4n&}&3v8gML}v#?UM`;ii$Ni;TP@OrJKlz;&?JQu=DEx%&CdAtR&FCEigp$NiJliYS7R`Oo^ z)C(&ZEafgTe~j*}3hiSpYpFLFtvjLjDi7-$w}=(N7O0{K5imQ_n;Y?#BROaf(!712 zB>Mr9{$pY{ffw~G;!Fx?&6O0GB=;~4fjUY0Mo~W{ur_I0S(Wf*UlOBlalRCJ>6rKUP{_XvU8~&s z@F}X}Tg?NRqbHQ)8TjY8#`xE0mlDKA5ski%L~$%$3r!W{dk~R(gitZS$%4PK;?aTK zw0vv&xMXNpP&Y~B}tESN~PXp~b zI=6Z+)t|G_AM4IZsH2C*Q2PYQm^T_7&X5vy?U8W8VRRensp89hB3|U&JqIh0R3#lo z4^2B6VVhy{W-2?P>Okc^06ZjGf} znZ@)Hke;Ax(QcBS(4K(&kV&wxm_kKw1Krt)+B#0V5-*mF;WOlPU*}DbuhIj53@z7S z3vh{=rkJU{B-?t_cStsGb{9pC1Qsp%w7Di|%r2G@PPbT!53>*Ju;bP@7(kQa@==mP zczll#UNz|KTdJwb2bR~S)i*g}>RfW3vDr##EF~9u zuwJP>=CKullht};ju0gF(p78s({T2TtnG};_ZvD zZSb&+=O+;Z9!GW4*wTsr-~7cudow=#Kc50V1$+wl6!0nFQ{bPQ0vD*uqk&Y#E5K-} zvDA0DK$t({c}>4*rjKuCrvDQC*vylcKfL?3>6qy+_H1#==y`o8PO0IF zEK_Tszf?H;)mp8G1tnXQvu#u(j&vqKJ2+8WNNC~Cr9wi)Hc=CR^98)dg|q`?O{}X( zdP4p#c7!$`nH0cBRQ1RL9C~NxAx;4v5$_URY23+GNLfwe6yc6ALjFY_9bZ`qY%Ub( zmLQCm(`~_5$lhc`aO1+?zY8uyGitQ1Lh`|s-qsO=AKA4`2^1@{Y2PJved4U?#5_xq zrUQOuPIV4jbE6m#LaZHvN5BwJT0@@HF?qDRG+11~XV66Zosk@Am zy!6iOHB91KK^tx`lx^#G;KG-4kK_-i+J5q2{Uu3#4@fl@r&j1L_VQ#81$%^UMLok= z^Lb1wp%eum!aUlqFz{e)5qVhwJt3*)yocrs^~;pbp#_<#yRQ#Zgj5eB$3^X?!RJX> MNc&(AE7Q#X1cwE+Qvd(} literal 0 HcmV?d00001 diff --git a/airflow/.env b/airflow/.env deleted file mode 100644 index d77f37a..0000000 --- a/airflow/.env +++ /dev/null @@ -1,23 +0,0 @@ -AIRFLOW_IMAGE_NAME=apache/airflow:2.10.4 -_AIRFLOW_WWW_USER_USERNAME=airflow 
-_AIRFLOW_WWW_USER_PASSWORD=airflow-password-ytld -AIRFLOW_UID=50000 -AIRFLOW_PROJ_DIR=. - -AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow-new-super-pass@89.253.221.173:52919/airflow -AIRFLOW__CELERY__RESULT_BACKEND=db+postgresql://airflow:airflow-new-super-pass@89.253.221.173:52919/airflow -AIRFLOW__CELERY__BROKER_URL=redis://:rOhTAIlTFFylXsjhqwxnYxDChFc@89.253.221.173:52909/0 - -AIRFLOW_QUEUE=holisticlegs-download -AIRFLOW_QUEUE_CHECK=holisticlegs-check -AIRFLOW_QUEUE_UPLOAD=holisticlegs-upload -AIRFLOW__WEBSERVER__SECRET_KEY=8DJ6XbtIICassrVxM9jWV3eTlt5N3XtyEdyW -HOSTNAME=85.192.30.55 - -AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT=768M -AIRFLOW_WORKER_DOWNLOAD_MEM_RESERV=522M -AIRFLOW_WORKER_DOWNLOAD_CONCURRENCY=2 - -AIRFLOW_SMALL_WORKERS_MEM_LIMIT=1024M -AIRFLOW_SMALL_WORKERS_MEM_RESERV=512M -~ \ No newline at end of file diff --git a/airflow/.env.example b/airflow/.env.example deleted file mode 100644 index d1e9310..0000000 --- a/airflow/.env.example +++ /dev/null @@ -1,60 +0,0 @@ -# This file contains all environment variables for the Airflow-based deployment. -# Copy this file to .env in the same directory and fill in your production values. -# This file is used by `generate_envoy_config.py` and `docker-compose-ytdlp-ops.yaml`. - -# --- Common Configuration --- -# A unique name for this server instance, used as a key in Redis. -# This is hardcoded in the docker-compose file but can be overridden here. -SERVER_IDENTITY=ytdlp-ops-airflow-service - -# Redis connection details for proxy and account state management. -REDIS_HOST=redis -REDIS_PORT=6379 -REDIS_PASSWORD=redis_pwd_K3fG8hJ1mN5pQ2sT - -# --- Airflow Database Configuration --- -# The password for the PostgreSQL database used by Airflow. -# This should be a secure, randomly generated password. -POSTGRES_PASSWORD=pgdb_pwd_A7bC2xY9zE1wV5uP - -# The password for the Airflow web UI admin user. -AIRFLOW_ADMIN_PASSWORD=admin_pwd_X9yZ3aB1cE5dF7gH - -# --- Envoy & Worker Configuration --- -# The public-facing port for the Envoy load balancer that fronts the WORKERS. -ENVOY_PORT=9080 -# The port for Envoy's admin/stats interface. -ENVOY_ADMIN_PORT=9901 -# The public-facing port for the standalone MANAGEMENT service. -MANAGEMENT_SERVICE_PORT=9091 -# The number of Python server workers to run. -# Set to 1 to simplify debugging. Multi-worker mode is experimental. -YTDLP_WORKERS=1 -# The starting port for the Python workers. They will use sequential ports (e.g., 9090, 9091, ...). -YTDLP_BASE_PORT=9090 - -# --- Camoufox (Browser) Configuration --- -# Comma-separated list of SOCKS5 proxies to be used by Camoufox instances. -# Each proxy will get its own dedicated browser instance. -# Example: CAMOUFOX_PROXIES="socks5://user:pass@p.webshare.io:80,socks5://user:pass@p.webshare.io:81" -CAMOUFOX_PROXIES="socks5://your_proxy_user:your_proxy_pass@proxy.example.com:1080,socks5://your_proxy_user:your_proxy_pass@proxy.example.com:1081" - -# Password for VNC access to the Camoufox browser instances. -VNC_PASSWORD=vnc_pwd_Z5xW8cV2bN4mP7lK - -# The starting port for VNC access. Ports will be assigned sequentially (e.g., 5901, 5902, ...). -CAMOUFOX_BASE_VNC_PORT=5901 - -# The internal port used by Camoufox for its WebSocket server. Usually does not need to be changed. -CAMOUFOX_PORT=12345 - -# --- General Proxy Configuration --- -# A general-purpose SOCKS5 proxy that can be used alongside Camoufox proxies. -# This should be the IP address of the proxy server accessible from within the Docker network. 
-# '172.17.0.1' is often the host IP from within a container. -SOCKS5_SOCK_SERVER_IP=172.17.0.1 - -# --- Account Manager Configuration --- -# Account cooldown parameters (values are in minutes). -ACCOUNT_ACTIVE_DURATION_MIN=30 -ACCOUNT_COOLDOWN_DURATION_MIN=60 diff --git a/airflow/dags-backup/old-download-dag.py b/airflow/dags-backup/old-download-dag.py deleted file mode 100644 index 6a6671e..0000000 --- a/airflow/dags-backup/old-download-dag.py +++ /dev/null @@ -1,1167 +0,0 @@ -# -*- coding: utf-8 -*- -# vim:fenc=utf-8 -# -# Copyright © 2024 rl -# -# Distributed under terms of the MIT license. - -""" -DAG for processing a single YouTube URL passed via DAG run configuration. -This is the "Worker" part of a Sensor/Worker pattern. -""" - -from airflow import DAG -from airflow.exceptions import AirflowException, AirflowSkipException -from airflow.models import BaseOperator, Variable -from airflow.models.param import Param -from airflow.operators.bash import BashOperator -from airflow.operators.dummy import DummyOperator -from airflow.operators.python import PythonOperator, BranchPythonOperator -from airflow.operators.dummy import DummyOperator -from airflow.providers.redis.hooks.redis import RedisHook -from airflow.utils.dates import days_ago -from airflow.utils.decorators import apply_defaults -from airflow.utils.task_group import TaskGroup -from datetime import datetime, timedelta -from airflow.api.common.trigger_dag import trigger_dag -from pangramia.yt.common.ttypes import TokenUpdateMode -from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException -from pangramia.yt.tokens_ops import YTTokenOpService -from thrift.protocol import TBinaryProtocol -from thrift.transport import TSocket, TTransport -from thrift.transport.TTransport import TTransportException -import json -import logging -import os -import random -import redis -import socket -import time -import traceback -import inspect -import uuid -import uuid - -# Import utility functions -from utils.redis_utils import _get_redis_client - -# Configure logging -logger = logging.getLogger(__name__) - -# Default settings -DEFAULT_QUEUE_NAME = 'video_queue' -DEFAULT_REDIS_CONN_ID = 'redis_default' -DEFAULT_MAX_URLS = 1 -DEFAULT_TIMEOUT = 180 # Default Thrift timeout in seconds - -DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="16.162.82.212") -DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080) - -# --- Helper Functions --- - -def _get_thrift_client(host, port, timeout): - """Helper to create and connect a Thrift client.""" - transport = TSocket.TSocket(host, port) - transport.setTimeout(timeout * 1000) - transport = TTransport.TFramedTransport(transport) - protocol = TBinaryProtocol.TBinaryProtocol(transport) - client = YTTokenOpService.Client(protocol) - transport.open() - logger.info(f"Connected to Thrift server at {host}:{port}") - return client, transport - - - - -def _extract_video_id(url): - """Extracts YouTube video ID from URL.""" - if not url or not isinstance(url, str): - logger.debug("URL is empty or not a string, cannot extract video ID.") - return None - try: - video_id = None - if 'youtube.com/watch?v=' in url: - video_id = url.split('v=')[1].split('&')[0] - elif 'youtu.be/' in url: - video_id = url.split('youtu.be/')[1].split('?')[0] - - if video_id and len(video_id) >= 11: - video_id = video_id[:11] # Standard ID length - logger.debug(f"Extracted video ID '{video_id}' from URL: {url}") - return video_id - else: - logger.debug(f"Could not extract 
a standard video ID pattern from URL: {url}") - return None - except Exception as e: - logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}") - return None - -# --- Queue Management Callables (for success/failure reporting) --- - - - -def mark_url_as_success(**context): - """Moves URL from progress to result hash on success.""" - ti = context['task_instance'] - params = context['params'] - url = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='url_to_process') - if not url: - logger.warning("mark_url_as_success called but no URL found in DAG run parameters.") - return - - queue_name = params['queue_name'] - result_queue = f"{queue_name}_result" - redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID) - - # Pull results from previous tasks - info_json_path = ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='info_json_path') or \ - ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='info_json_path') - socks_proxy = ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='socks_proxy') or \ - ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='socks_proxy') - ytdlp_command = ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='ytdlp_command') or \ - ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='ytdlp_command') - downloaded_file_path = ti.xcom_pull(task_ids='download_and_probe') - - logger.info(f"Handling success for URL: {url}") - logger.info(f" Downloaded File Path: {downloaded_file_path}") - - result_data = { - 'status': 'success', - 'end_time': time.time(), - 'info_json_path': info_json_path, - 'socks_proxy': socks_proxy, - 'ytdlp_command': ytdlp_command, - 'downloaded_file_path': downloaded_file_path, - 'url': url, - 'dag_run_id': context['dag_run'].run_id, - } - - try: - # In the worker pattern, there's no "progress" hash to remove from. - # We just add the result to the success hash. - client = _get_redis_client(redis_conn_id) - client.hset(result_queue, url, json.dumps(result_data)) - logger.info(f"Stored success result for URL '{url}' in result hash '{result_queue}'.") - except Exception as e: - logger.error(f"Error handling success in Redis for URL '{url}': {e}", exc_info=True) - # Log error but don't fail the task, as the main work succeeded. - - -def handle_failure_callable(**context): - """ - Handles a failed processing run by recording rich, detailed error information to Redis. - """ - ti = context['task_instance'] - params = context['params'] - dag_run = context['dag_run'] - url = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='url_to_process') - - if not url: - # This can happen if pull_url_and_assign_account itself fails. - # We can't record a URL-specific failure, but we should log it. - failed_tis = [ti for ti in dag_run.get_task_instances() if ti.state == 'failed'] - failed_task_ids = [ti.task_id for ti in failed_tis] - logger.error(f"handle_failure_callable was triggered for run {dag_run.run_id}, but no URL was found in XCom. " - f"This likely means an early task failed. 
Failed tasks in run: {failed_task_ids}") - return - - # --- Start building the rich error report --- - failure_report = { - 'url': url, - 'dag_run_id': dag_run.run_id, - 'failure_timestamp': datetime.now().isoformat(), - 'failed_task': 'unknown', - 'failure_summary': 'An unknown error occurred.', - 'failure_history': [], - 'download_error': None, - 'generic_error': None - } - - # --- Gather data from token acquisition attempts --- - # Attempt 1: get_token - get_token_ti = dag_run.get_task_instance('acquire_token_with_retry.get_token') - if get_token_ti: - error_details_1 = ti.xcom_pull(task_ids=get_token_ti.task_id, key='error_details') - account_1 = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='account_id') - - attempt_1_report = { - 'task_id': get_token_ti.task_id, - 'account_id': account_1, - 'status': get_token_ti.state, - 'start_date': get_token_ti.start_date.isoformat() if get_token_ti.start_date else None, - 'end_date': get_token_ti.end_date.isoformat() if get_token_ti.end_date else None, - } - if error_details_1: - attempt_1_report.update({ - 'proxy_url': error_details_1.get('proxy_url'), - 'error_code': error_details_1.get('error_code'), - 'error_message': error_details_1.get('error_message'), - }) - failure_report['failure_history'].append(attempt_1_report) - - # Attempt 2: retry_get_token - retry_get_token_ti = dag_run.get_task_instance('acquire_token_with_retry.retry_get_token') - # Only report on retry if it actually ran - if retry_get_token_ti and retry_get_token_ti.state: - error_details_2 = ti.xcom_pull(task_ids=retry_get_token_ti.task_id, key='error_details') - account_2 = ti.xcom_pull(task_ids='acquire_token_with_retry.assign_new_account_for_retry', key='account_id') - - attempt_2_report = { - 'task_id': retry_get_token_ti.task_id, - 'account_id': account_2, - 'status': retry_get_token_ti.state, - 'start_date': retry_get_token_ti.start_date.isoformat() if retry_get_token_ti.start_date else None, - 'end_date': retry_get_token_ti.end_date.isoformat() if retry_get_token_ti.end_date else None, - } - if error_details_2: - attempt_2_report.update({ - 'proxy_url': error_details_2.get('proxy_url'), - 'error_code': error_details_2.get('error_code'), - 'error_message': error_details_2.get('error_message'), - }) - failure_report['failure_history'].append(attempt_2_report) - - # --- Identify the primary failure point --- - exception = context.get('exception') - - # Case 1: Download & Probe failure - download_probe_ti = dag_run.get_task_instance('download_and_probe') - if download_probe_ti and download_probe_ti.state == 'failed': - failure_report['failed_task'] = download_probe_ti.task_id - failure_report['failure_summary'] = 'Download or probe failed after successful token acquisition.' - failure_report['download_error'] = { - 'error_message': str(exception) if exception else "BashOperator failed. 
Check task logs for yt-dlp/ffmpeg output.", - 'error_type': type(exception).__name__ if exception else "Unknown", - } - - # Case 2: Token acquisition failure - else: - last_failed_attempt = next((attempt for attempt in reversed(failure_report['failure_history']) if attempt['status'] == 'failed'), None) - if last_failed_attempt: - failure_report['failed_task'] = last_failed_attempt['task_id'] - failure_report['failure_summary'] = f"Token acquisition failed with error: {last_failed_attempt.get('error_code', 'Unknown')}" - else: - # Case 3: Generic/unexpected failure - failed_tis = [ti for ti in dag_run.get_task_instances() if ti.state == 'failed'] - if failed_tis: - # Heuristic: pick the one with the latest end_date that is not this task itself - failed_tis.sort(key=lambda x: x.end_date or datetime.min) - last_failed_ti = next((ti for ti in reversed(failed_tis) if ti.task_id != context['task_instance'].task_id), None) - if last_failed_ti: - failure_report['failed_task'] = last_failed_ti.task_id - failure_report['failure_summary'] = f"Task '{last_failed_ti.task_id}' failed unexpectedly." - failure_report['generic_error'] = { - 'error_message': str(exception) if exception else f"Unexpected failure in task {last_failed_ti.task_id}. Check logs.", - 'error_type': type(exception).__name__ if exception else "Unknown", - 'traceback': "".join(traceback.format_exception(etype=type(exception), value=exception, tb=exception.__traceback__)) if exception else "No traceback available." - } - - logger.info(f"Handling failure for URL: {url}") - logger.error(f" Failure Summary: {failure_report['failure_summary']}") - logger.error(f" Failed Task: {failure_report['failed_task']}") - # Using print to ensure the full JSON is visible in the logs without truncation - print("--- Detailed Failure Report ---") - print(json.dumps(failure_report, indent=2)) - print("-----------------------------") - - # For all failures, mark the URL as failed in Redis. - redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID) - queue_name = params['queue_name'] - fail_queue = f"{queue_name}_fail" - try: - client = _get_redis_client(redis_conn_id) - client.hset(fail_queue, url, json.dumps(failure_report, indent=2)) - logger.info(f"Stored detailed failure info for URL '{url}' in fail hash '{fail_queue}'.") - except Exception as e: - logger.error(f"Critical error during failure handling in Redis for URL '{url}': {e}", exc_info=True) - raise AirflowException(f"Could not handle failure in Redis: {e}") - -# --- YtdlpOpsOperator --- - -def _get_account_pool(params: dict) -> list: - """ - Gets the list of accounts to use for processing, filtering out banned/resting accounts. - Supports three modes for the 'account_pool' parameter: - 1. Explicit List: If 'account_pool' contains a comma, it's treated as a comma-separated list. - 2. Prefix-based Generation: If 'account_pool_size' is provided, 'account_pool' is treated as a prefix - to generate numbered accounts (e.g., prefix_01, prefix_02). - 3. Single Account: If 'account_pool' has no comma and 'account_pool_size' is not provided, it's treated as a single account name. - If the pool is exhausted and auto-creation is enabled, it will generate a new account ID. 
- """ - account_pool_str = params.get('account_pool', 'default_account') - accounts = [] - is_prefix_mode = False - - if ',' in account_pool_str: - # Mode 1: Explicit comma-separated list - logger.info("Detected comma in 'account_pool', treating as an explicit list.") - accounts = [acc.strip() for acc in account_pool_str.split(',') if acc.strip()] - else: - # Mode 2 or 3: Prefix-based generation OR single account - prefix = account_pool_str - pool_size_param = params.get('account_pool_size') - - if pool_size_param is not None: - # Mode 2: Prefix mode - is_prefix_mode = True - logger.info("Detected 'account_pool_size', treating 'account_pool' as a prefix.") - - try: - pool_size = int(pool_size_param) - if pool_size <= 0: - raise AirflowException("'account_pool_size' must be a positive integer for prefix-based generation.") - except (ValueError, TypeError): - raise AirflowException(f"'account_pool_size' must be an integer, but got: {pool_size_param}") - - logger.info(f"Account pool size is set to: {pool_size}") - - # Generate accounts like 'prefix_01', 'prefix_02', ..., 'prefix_10' - for i in range(1, pool_size + 1): - accounts.append(f"{prefix}_{i:02d}") - else: - # Mode 3: Single account mode - logger.info("No 'account_pool_size' provided. Treating 'account_pool' as a single account name.") - accounts = [prefix] - - if not accounts: - raise AirflowException("Initial account pool is empty. Please check 'account_pool' and 'account_pool_size' parameters.") - - logger.info(f"Generated initial account pool with {len(accounts)} accounts: {accounts}") - - # --- Filter out banned/resting accounts by checking Redis --- - redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID) - try: - redis_client = _get_redis_client(redis_conn_id) - active_accounts = [] - for account in accounts: - status_key = f"account_status:{account}" - status_bytes = redis_client.hget(status_key, "status") - status = status_bytes.decode('utf-8') if status_bytes else "ACTIVE" - - if status == 'BANNED': - logger.warning(f"Account '{account}' is BANNED. Skipping.") - continue - if 'RESTING' in status: # Check for 'RESTING' or 'RESTING (active in...)' - logger.info(f"Account '{account}' is RESTING. Skipping.") - continue - - active_accounts.append(account) - - if not active_accounts and accounts: - logger.error(f"All {len(accounts)} accounts in the pool are banned or resting.") - - auto_create = params.get('auto_create_new_accounts_on_exhaustion', False) - if auto_create and is_prefix_mode: - prefix = account_pool_str - new_account_id = f"{prefix}-auto-{str(uuid.uuid4())[:8]}" - logger.warning(f"Account pool exhausted. Auto-creating new account: '{new_account_id}'") - active_accounts.append(new_account_id) - else: - if not auto_create: - logger.error("Auto-creation is disabled. No workers can be scheduled.") - if not is_prefix_mode: - logger.error("Auto-creation is only supported for prefix-based account pools.") - raise AirflowException("All accounts in the configured pool are currently exhausted (banned or resting).") - - if len(active_accounts) < len(accounts): - logger.info(f"Filtered account pool. Using {len(active_accounts)} of {len(accounts)} available accounts.") - - accounts = active_accounts - - except Exception as e: - logger.error(f"Could not filter accounts by status from Redis. Using unfiltered pool. Error: {e}", exc_info=True) - - if not accounts: - raise AirflowException("Account pool is empty after filtering. 
Please check account statuses in Redis or enable auto-creation.") - - logger.info(f"Final active account pool with {len(accounts)} accounts: {accounts}") - return accounts - - - -def pull_url_and_assign_account_callable(**context): - """ - Pulls a single URL from Redis and assigns an active account for the run. - If the queue is empty, it skips the DAG run. - Otherwise, it pushes the URL and account details to XCom. - """ - params = context['params'] - ti = context['task_instance'] - - # --- Part 1: Pull URL from Redis --- - queue_name = params['queue_name'] - redis_conn_id = params['redis_conn_id'] - inbox_queue = f"{queue_name}_inbox" - - logger.info(f"Attempting to pull one URL from Redis queue '{inbox_queue}'...") - client = _get_redis_client(redis_conn_id) - url_bytes = client.lpop(inbox_queue) - - if not url_bytes: - logger.info("Queue is empty. Stopping this worker loop.") - raise AirflowSkipException("Redis queue is empty.") - - url_to_process = url_bytes.decode('utf-8') - logger.info(f"Pulled URL '{url_to_process}' from the queue.") - ti.xcom_push(key='url_to_process', value=url_to_process) - - # --- Part 2: Assign Account --- - logger.info("URL found, proceeding to assign an account.") - # Affinity logic: check if an account was passed from a previous run - account_id = params.get('current_account_id') - if account_id: - logger.info(f"Using account '{account_id}' passed from previous run (affinity).") - else: - logger.info("No account passed from previous run. Selecting a new one from the pool.") - account_pool = _get_account_pool(params) - account_id = random.choice(account_pool) - logger.info(f"Selected initial account '{account_id}'.") - - ti.xcom_push(key='account_id', value=account_id) - ti.xcom_push(key='accounts_tried', value=[account_id]) - - -def decide_what_to_do_next_callable(**context): - """ - Decides whether to continue the processing loop by triggering the next worker - or to stop the loop, based on task success, failure, or an empty queue. - """ - params = context['params'] - dag_run = context['dag_run'] - - # Check if a failure was handled. If the 'handle_generic_failure' task was not skipped, - # it means a failure occurred somewhere in the pipeline. - handle_generic_failure_ti = dag_run.get_task_instance(task_id='handle_generic_failure') - if handle_generic_failure_ti and handle_generic_failure_ti.state != 'skipped': - logger.error(f"Failure handler task 'handle_generic_failure' was in state '{handle_generic_failure_ti.state}'. Stopping this processing lane.") - return 'mark_dag_run_as_failed' - - # Check if the worker was skipped because the Redis queue was empty. - pull_task_instance = dag_run.get_task_instance(task_id='pull_url_and_assign_account') - if pull_task_instance and pull_task_instance.state == 'skipped': - logger.info("Worker was skipped because Redis queue was empty.") - retrigger_delay_on_empty_s = params.get('retrigger_delay_on_empty_s', 60) - - if retrigger_delay_on_empty_s < 0: - logger.info(f"retrigger_delay_on_empty_s is {retrigger_delay_on_empty_s}. Stopping this worker loop.") - return 'stop_worker_lane_gracefully' - else: - logger.info(f"Queue is empty. Will re-trigger this worker loop after a delay of {retrigger_delay_on_empty_s}s.") - return 'continue_loop_and_trigger_next_run' - - # If no failure was handled and the queue wasn't empty, it must be a success. - logger.info("All preceding tasks succeeded. 
Continuing the processing loop by triggering the next worker.") - return 'continue_loop_and_trigger_next_run' - -def get_token_callable(**context): - """Makes a single attempt to get a token from the Thrift service.""" - ti = context['task_instance'] - params = context['params'] - - # Determine which account to use (initial or retry) - # Pull from all upstreams, which might return a LazySelectSequence - xcom_results = ti.xcom_pull(task_ids=context['task'].upstream_task_ids, key='account_id') - - # The result can be a single value or an iterable. We need to find the first valid item. - account_id = None - if hasattr(xcom_results, '__iter__') and not isinstance(xcom_results, str): - # It's a list, tuple, or LazySelectSequence. Find the first real value. - account_id = next((item for item in xcom_results if item is not None), None) - else: - # It's a single value - account_id = xcom_results - - if not account_id: - raise AirflowException("Could not find a valid account_id in XCom from any upstream task.") - - url = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='url_to_process') - if not url: - logger.info("No URL pulled from XCom. Assuming upstream task was skipped. Ending task.") - return - - host = params['service_ip'] - port = int(params['service_port']) - timeout = int(params.get('timeout', DEFAULT_TIMEOUT)) - # The value from templates_dict is already rendered by Airflow. - info_json_dir = context['templates_dict']['info_json_dir'] - machine_id = params.get('machine_id') or socket.gethostname() - clients = params.get('clients') - - logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' ---") - client, transport = None, None - try: - client, transport = _get_thrift_client(host, port, timeout) - client.ping() - - call_kwargs = {'accountId': account_id, 'updateType': TokenUpdateMode.AUTO, 'url': url, 'clients': clients, 'machineId': machine_id} - token_data = client.getOrRefreshToken(**call_kwargs) - - # --- Log response details for debugging --- - response_summary = { - "has_infoJson": hasattr(token_data, 'infoJson') and bool(token_data.infoJson), - "infoJson_size": len(token_data.infoJson) if hasattr(token_data, 'infoJson') and token_data.infoJson else 0, - "has_ytdlpCommand": hasattr(token_data, 'ytdlpCommand') and bool(token_data.ytdlpCommand), - "proxy_type": next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr) and getattr(token_data, attr)), 'None'), - "jobId": getattr(token_data, 'jobId', None) - } - logger.info(f"Successfully retrieved token data from service. Response summary: {json.dumps(response_summary)}") - - # --- Success Case --- - info_json = getattr(token_data, 'infoJson', None) - if info_json and json.loads(info_json): - video_id = _extract_video_id(url) - save_dir = info_json_dir or "." 
- os.makedirs(save_dir, exist_ok=True) - timestamp = int(time.time()) - base_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json" - info_json_path = os.path.join(save_dir, base_filename) - with open(info_json_path, 'w', encoding='utf-8') as f: - f.write(info_json) - ti.xcom_push(key='info_json_path', value=info_json_path) - - # Log key details from the info.json to confirm success - try: - info_data = json.loads(info_json) - if isinstance(info_data, dict): - title = info_data.get('title', 'N/A') - uploader = info_data.get('uploader', 'N/A') - duration = info_data.get('duration_string', 'N/A') - logger.info(f"Successfully got info.json for video: '{title}' by '{uploader}' ({duration})") - except Exception as log_e: - logger.warning(f"Could not log info.json details: {log_e}") - - proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None) - ti.xcom_push(key='socks_proxy', value=getattr(token_data, proxy_attr) if proxy_attr else None) - ti.xcom_push(key='ytdlp_command', value=getattr(token_data, 'ytdlpCommand', None)) - ti.xcom_push(key='successful_account_id', value=account_id) # For affinity - ti.xcom_push(key='get_token_succeeded', value=True) - else: - # This is a failure case: the service returned success but no usable data. - logger.error(f"Thrift call for account '{account_id}' succeeded but returned no info.json. Treating as failure.") - # The generic failure handler will pick up this exception. - raise AirflowException("Service returned success but info.json was empty or invalid.") - - except (PBServiceException, PBUserException, TTransportException) as e: - error_context = getattr(e, 'context', None) - if isinstance(error_context, str): - try: - error_context = json.loads(error_context.replace("'", "\"")) - except: pass - - error_message = getattr(e, 'message', str(e)) - error_code = getattr(e, 'errorCode', 'TRANSPORT_ERROR') - - # Check for wrapped timeout exception to provide a clearer error message. - inner_exception = getattr(e, 'inner', getattr(e, '__cause__', None)) - if isinstance(e, TTransportException) and isinstance(inner_exception, socket.timeout): - error_message = f"Socket timeout during Thrift call (wrapped in TTransportException)" - error_code = 'SOCKET_TIMEOUT' - - error_details = { - 'error_message': error_message, - 'error_code': error_code, - 'error_type': type(e).__name__, - 'traceback': traceback.format_exc(), - 'proxy_url': error_context.get('proxy_url') if isinstance(error_context, dict) else None - } - - proxy_url_info = f" with proxy '{error_details['proxy_url']}'" if error_details.get('proxy_url') else "" - - if error_code == 'SOCKET_TIMEOUT': - logger.error(f"Thrift call for account '{account_id}'{proxy_url_info} failed due to a socket timeout after {timeout} seconds.") - elif isinstance(e, TTransportException) and e.type == TTransportException.TIMED_OUT: - logger.error(f"Thrift call for account '{account_id}'{proxy_url_info} timed out after {timeout} seconds.") - else: - logger.error(f"Thrift call failed for account '{account_id}'{proxy_url_info}. Exception: {error_details['error_message']}") - - ti.xcom_push(key='error_details', value=error_details) - ti.xcom_push(key='get_token_succeeded', value=False) - - # Always fail the task on any Thrift exception. The branch operator will inspect the failure. 
- raise AirflowException(f"Thrift call failed: {error_details['error_message']}") - finally: - if transport and transport.isOpen(): - transport.close() - - -def handle_bannable_error_branch_callable(**context): - """ - Inspects a failed `get_token` task. If the failure was a "bannable" error, - it routes to the retry logic. Otherwise, it lets the DAG fail. - This task only runs if the upstream `get_token` task fails. - """ - ti = context['task_instance'] - params = context['params'] - - # We know get_token failed because of the trigger_rule='one_failed'. - # Pull the error details it left behind. - error_details = ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='error_details') - if not error_details: - logger.error("The 'get_token' task failed, but no error details were found in XCom. " - "This indicates an unexpected error. Letting the DAG fail.") - return None # Do nothing, let the group fail. - - # We have error details, now check if the error is "bannable". - error_code = error_details.get('error_code', '').strip() - error_message = error_details.get('error_message', '').lower() - policy = params.get('on_bannable_failure', 'retry_with_new_account') - bannable_codes = ["BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED", "SOCKS5_CONNECTION_FAILED"] - is_bannable = error_code in bannable_codes - - # Override bannable status for age-restricted content, which is not a true bot detection. - if is_bannable and ('confirm your age' in error_message or 'age-restricted' in error_message): - logger.warning(f"Error is age-related ('{error_code}'). Treating as a non-bannable failure to avoid banning the account.") - is_bannable = False - - logger.info(f"Handling failure from 'get_token'. Error code: '{error_code}', Is Bannable: {is_bannable}, Policy: '{policy}'") - - if is_bannable and policy == 'retry_with_new_account': - logger.info("Error is bannable and policy allows retry. Proceeding to ban first account and retry.") - return 'acquire_token_with_retry.ban_account_and_prepare_for_retry' - elif is_bannable: # and policy is 'stop_loop' - logger.warning("Error is bannable and policy is 'stop_loop'. Banning account and stopping.") - return 'acquire_token_with_retry.ban_account_and_fail' - else: # Not a bannable error - logger.error(f"Error '{error_code}' is not bannable. Letting the DAG fail.") - return None # Do nothing, let the group fail. - - -def assign_new_account_for_retry_callable(**context): - """Selects a new, unused account for the retry attempt.""" - ti = context['task_instance'] - params = context['params'] - - accounts_tried = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='accounts_tried') - if not accounts_tried: - raise AirflowException("Cannot retry, list of previously tried accounts not found.") - - logger.info(f"Policy is 'retry_with_new_account'. Selecting a new account. Already tried: {accounts_tried}") - try: - account_pool = _get_account_pool(params) - available_for_retry = [acc for acc in account_pool if acc not in accounts_tried] - - new_account_id = None - if available_for_retry: - new_account_id = random.choice(available_for_retry) - else: - # No unused accounts left in the pool. Check if we can auto-create one. - logger.warning("No unused accounts available in the pool for a retry. 
Checking for auto-creation.") - auto_create = params.get('auto_create_new_accounts_on_exhaustion', False) - account_pool_str = params.get('account_pool', 'default_account') - pool_size_param = params.get('account_pool_size') - is_prefix_mode = pool_size_param is not None and ',' not in account_pool_str - - if auto_create and is_prefix_mode: - prefix = account_pool_str - new_account_id = f"{prefix}-auto-{str(uuid.uuid4())[:8]}" - logger.warning(f"Auto-creating new account for retry: '{new_account_id}'") - else: - if not auto_create: - logger.error("Auto-creation is disabled.") - if not is_prefix_mode: - logger.error("Auto-creation is only supported for prefix-based account pools (requires 'account_pool_size').") - raise AirflowException("No other accounts available in the pool for a retry.") - - accounts_tried.append(new_account_id) - - logger.info(f"Selected new account for retry: '{new_account_id}'") - ti.xcom_push(key='account_id', value=new_account_id) - ti.xcom_push(key='accounts_tried', value=accounts_tried) - - except Exception as e: - logger.error(f"Could not get a new account for retry: {e}") - raise AirflowException(f"Failed to assign new account for retry: {e}") - - -def handle_retry_failure_branch_callable(**context): - """ - Checks a failed `retry_get_token` task. If the failure was a handled Thrift error, - it triggers the banning of the second account/proxy. - This task only runs if the upstream `retry_get_token` task fails. - """ - ti = context['task_instance'] - - # We know retry_get_token failed. Check if it was a handled failure. - error_details = ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='error_details') - - if not error_details: - logger.error("The 'retry_get_token' task failed unexpectedly before it could record error details. " - "Letting the DAG fail without banning the account/proxy.") - return None - - # If we are here, it means the retry failed with a handled Thrift error. - # We will proceed to ban the second account and proxy. - logger.error("Retry attempt also failed with a handled Thrift error. Banning second account and proxy.") - return 'acquire_token_with_retry.ban_second_account_and_proxy' - - -def ban_first_account_callable(**context): - """Bans the first account that failed due to a bannable error.""" - ti = context['task_instance'] - params = context['params'] - - # The account ID is pulled from the initial assignment task. - account_to_ban = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='account_id') - if not account_to_ban: - logger.warning("Could not find the initial account ID to ban. Skipping.") - return - - client, transport = None, None - try: - host = params['service_ip'] - port = int(params['service_port']) - timeout = int(params.get('timeout', DEFAULT_TIMEOUT)) - client, transport = _get_thrift_client(host, port, timeout) - - reason = "Banned by Airflow worker due to bannable error on first attempt" - logger.warning(f"Banning account '{account_to_ban}'. Reason: {reason}") - client.banAccount(accountId=account_to_ban, reason=reason) - logger.info(f"Successfully sent request to ban account '{account_to_ban}'.") - except Exception as e: - logger.error(f"Failed to issue ban for account '{account_to_ban}': {e}", exc_info=True) - # Don't fail the task, as this is a best-effort cleanup action. 
- finally: - if transport and transport.isOpen(): - transport.close() - - -def ban_first_account_and_fail_callable(**context): - """Bans the first account that failed, and then intentionally fails the task.""" - ti = context['task_instance'] - params = context['params'] - - # The account ID is pulled from the initial assignment task. - account_to_ban = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='account_id') - if not account_to_ban: - logger.warning("Could not find the initial account ID to ban. Skipping.") - else: - client, transport = None, None - try: - host = params['service_ip'] - port = int(params['service_port']) - timeout = int(params.get('timeout', DEFAULT_TIMEOUT)) - client, transport = _get_thrift_client(host, port, timeout) - - reason = "Banned by Airflow worker due to bannable error (policy is stop_loop)" - logger.warning(f"Banning account '{account_to_ban}'. Reason: {reason}") - client.banAccount(accountId=account_to_ban, reason=reason) - logger.info(f"Successfully sent request to ban account '{account_to_ban}'.") - except Exception as e: - logger.error(f"Failed to issue ban for account '{account_to_ban}': {e}", exc_info=True) - # Log error, but continue to fail the task. - finally: - if transport and transport.isOpen(): - transport.close() - - # Intentionally fail the task to stop the DAG run as per policy. - reason = "Bannable error detected, policy is stop_loop." - logger.warning(f"INTENTIONAL FAILURE: This task is now failing itself as per the 'stop_loop' policy. Reason: {reason}") - raise AirflowException(f"Failing task as per policy. Reason: {reason}") - - -def ban_second_account_and_proxy_callable(**context): - """Bans the second account and the proxy used in the failed retry, then fails the task.""" - ti = context['task_instance'] - params = context['params'] - - account_to_ban = ti.xcom_pull(task_ids='acquire_token_with_retry.assign_new_account_for_retry', key='account_id') - error_details = ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='error_details') - proxy_to_ban = error_details.get('proxy_url') if error_details else None - - if not account_to_ban and not proxy_to_ban: - logger.warning("Could not find an account or proxy to ban from the failed retry. Nothing to do.") - # Still fail the task to stop the DAG. - raise AirflowException("Token acquisition failed on retry, but no resources found to ban.") - - client, transport = None, None - try: - host = params['service_ip'] - port = int(params['service_port']) - timeout = int(params.get('timeout', DEFAULT_TIMEOUT)) - client, transport = _get_thrift_client(host, port, timeout) - - # Ban the second account - if account_to_ban: - reason = "Banned by Airflow worker due to failure on retry attempt" - logger.warning(f"Banning account '{account_to_ban}'. 
Reason: {reason}") - try: - client.banAccount(accountId=account_to_ban, reason=reason) - logger.info(f"Successfully sent request to ban account '{account_to_ban}'.") - except Exception as e: - logger.error(f"Failed to issue ban for account '{account_to_ban}': {e}", exc_info=True) - - # Ban the proxy - if proxy_to_ban: - server_identity = params.get('machine_id') or socket.gethostname() - logger.warning(f"Banning proxy '{proxy_to_ban}' for server '{server_identity}'.") - try: - client.banProxy(proxyUrl=proxy_to_ban, serverIdentity=server_identity) - logger.info(f"Successfully sent request to ban proxy '{proxy_to_ban}'.") - except Exception as e: - logger.error(f"Failed to issue ban for proxy '{proxy_to_ban}': {e}", exc_info=True) - - except Exception as e: - logger.error(f"An error occurred while trying to connect to the Thrift service to ban resources: {e}", exc_info=True) - # Log the error but continue to the failure exception, as this is a best-effort cleanup. - finally: - if transport and transport.isOpen(): - transport.close() - - # After attempting to ban, we must fail this task to fail the group. - logger.warning("INTENTIONAL FAILURE: This task is now failing itself to correctly signal the end of the retry process and stop the worker lane. The second account and/or proxy have been banned.") - raise AirflowException("Token acquisition failed on retry. Banned second account and proxy.") - - -def trigger_self_run_callable(**context): - """Triggers a new run of this same DAG to continue the processing loop, with an optional delay.""" - ti = context['task_instance'] - params = context['params'] - dag_run = context['dag_run'] - - # Check if this was triggered due to an empty queue to apply the specific delay. - pull_task_instance = dag_run.get_task_instance(task_id='pull_url_and_assign_account') - is_empty_queue_scenario = pull_task_instance and pull_task_instance.state == 'skipped' - - delay = 0 - if is_empty_queue_scenario: - # Use the specific delay for empty queues. Default to 60s. - delay = params.get('retrigger_delay_on_empty_s', 60) - logger.info(f"Queue was empty. Applying delay of {delay}s before re-triggering.") - else: - # For successful runs, re-trigger immediately by default. - logger.info("Worker finished successfully. Triggering next run of itself to continue the loop.") - delay = 0 # Immediate re-trigger on success. - - if delay > 0: - logger.info(f"Waiting for {delay}s before triggering next run.") - time.sleep(delay) - logger.info(f"Finished waiting {delay}s. Proceeding to trigger next run.") - - # Generate a unique run_id for the new worker run - run_id = f"self_triggered_{datetime.utcnow().isoformat()}" - - # Pass through all original parameters to the new run. - conf_to_pass = {k: v for k, v in params.items() if v is not None} - - # The new run will pull its own URL, so we ensure 'url' is not passed. - if 'url' in conf_to_pass: - del conf_to_pass['url'] - - # Pass the successful account ID to the next run for affinity. - # It could come from the first attempt or the retry. 
- successful_account_ids = ti.xcom_pull(task_ids=['acquire_token_with_retry.get_token', 'acquire_token_with_retry.retry_get_token'], key='successful_account_id') - successful_account_id = next((acc for acc in successful_account_ids if acc), None) - - if successful_account_id: - conf_to_pass['current_account_id'] = successful_account_id - logger.info(f"Passing successful account '{successful_account_id}' to the next worker run for affinity.") - else: - # If no account was successful (e.g., empty queue scenario), don't pass one. - # The next run will pick a new one. - conf_to_pass['current_account_id'] = None - logger.info("No successful account ID found. Next worker will select a new account from the pool.") - - logger.info(f"Triggering 'ytdlp_ops_worker_per_url' with run_id '{run_id}' and conf: {conf_to_pass}") - - trigger_dag( - dag_id='ytdlp_ops_worker_per_url', # Trigger itself - run_id=run_id, - conf=conf_to_pass, - replace_microseconds=False - ) - logger.info("Successfully triggered the next worker run.") - - -# ============================================================================= -# DAG Definition -# ============================================================================= - -default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 0, - 'retry_delay': timedelta(minutes=1), - 'start_date': days_ago(1), - 'queue': 'queue-dl002', -} - -with DAG( - dag_id='ytdlp_ops_worker_per_url', - default_args=default_args, - schedule_interval=None, - catchup=False, - description='Self-sustaining worker DAG that processes URLs from a Redis queue in a continuous loop.', - doc_md=""" - ### YT-DLP Self-Sustaining Worker - - This DAG is a self-sustaining worker that processes URLs in a continuous loop. - It is started by the `ytdlp_ops_orchestrator` (the "ignition system"). - - #### How it Works: - - 1. **Ignition:** An initial run is triggered by the orchestrator. - 2. **Pull & Assign:** It pulls a URL from Redis and assigns an account for the job, reusing the last successful account if available (affinity). - 3. **Get Token:** It calls the `ytdlp-ops-server` to get tokens and `info.json`. This step is encapsulated in a `TaskGroup` that handles a single retry on failure. - 4. **Failure Handling:** If `get_token` fails with a "bannable" error (like bot detection), it follows the `on_bannable_failure` policy: - - `retry_with_new_account` (default): It bans the failing account, picks a new one, and retries the `get_token` call once. If the retry also fails, it bans the second account and the proxy, then stops the loop. - - `stop_loop`: It bans the account and stops the loop immediately. - 5. **Download:** If tokens are retrieved successfully, it downloads the media. - 6. **Continue or Stop:** After success, or a non-recoverable failure, it decides whether to continue the loop by re-triggering itself or to stop. - - This creates a "processing lane" that runs independently until the queue is empty or a failure occurs. - """, - tags=['ytdlp', 'worker'], - params={ - # Worker loop control params (passed from orchestrator) - 'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="Base name for Redis queues."), - 'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."), - # Worker-specific params - 'service_ip': Param(DEFAULT_YT_AUTH_SERVICE_IP, type="string", description="Service IP. 
Default is from Airflow variable YT_AUTH_SERVICE_IP or hardcoded."), - 'service_port': Param(DEFAULT_YT_AUTH_SERVICE_PORT, type="integer", description="Port of the Envoy load balancer. Default is from Airflow variable YT_AUTH_SERVICE_PORT or hardcoded."), - 'account_pool': Param('default_account', type="string", description="Account pool prefix or comma-separated list."), - 'account_pool_size': Param(None, type=["integer", "null"], description="If using a prefix for 'account_pool', this specifies the number of accounts to generate (e.g., 10 for 'prefix_01' through 'prefix_10'). Required when using a prefix."), - 'machine_id': Param(None, type=["string", "null"], description="Identifier for the client machine, used for proxy usage tracking. If not set, worker hostname will be used."), - 'clients': Param('mweb', type="string", description="Comma-separated list of clients to use for token generation (e.g., 'ios,android,mweb')."), - 'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."), - 'download_format': Param('ba[ext=m4a]/bestaudio/best', type="string", description="yt-dlp format selection string."), - 'output_path_template': Param("%(title)s [%(id)s].%(ext)s", type="string", description="yt-dlp output filename template."), - 'on_bannable_failure': Param( - 'retry_with_new_account', - type="string", - enum=['stop_loop', 'retry_with_new_account'], - title="On Bannable Failure Policy", - description="Policy for when a bannable error occurs. 'stop_loop' or 'retry_with_new_account'." - ), - 'retry_on_probe_failure': Param(False, type="boolean", description="If True, attempts to re-download and probe a file if the initial probe fails."), - 'auto_create_new_accounts_on_exhaustion': Param(True, type="boolean", description="If True and all accounts in a prefix-based pool are exhausted, create a new one automatically."), - 'retrigger_delay_on_empty_s': Param(60, type="integer", description="Delay in seconds before re-triggering a worker if the queue is empty. Set to -1 to stop the loop."), - # --- Internal Worker Parameters (for self-triggering loop) --- - 'current_account_id': Param(None, type=["string", "null"], description="[Internal] The account ID used by the previous run in this worker lane. 
Used to maintain account affinity."), - } -) as dag: - - pull_url_and_assign_account = PythonOperator( - task_id='pull_url_and_assign_account', - python_callable=pull_url_and_assign_account_callable, - ) - - # --- Encapsulate token acquisition logic in a TaskGroup for visual clarity --- - with TaskGroup(group_id='acquire_token_with_retry') as acquire_token_group: - get_token = PythonOperator( - task_id='get_token', - python_callable=get_token_callable, - templates_dict={'info_json_dir': "{{ dag_run.conf.get('info_json_dir', var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')) }}"}, - ) - - handle_bannable_error_branch = BranchPythonOperator( - task_id='handle_bannable_error_branch', - python_callable=handle_bannable_error_branch_callable, - trigger_rule='one_failed', # This task should only run if get_token fails - ) - - # --- Retry Path --- - ban_account_and_prepare_for_retry = PythonOperator( - task_id='ban_account_and_prepare_for_retry', - python_callable=ban_first_account_callable, - ) - - assign_new_account_for_retry = PythonOperator( - task_id='assign_new_account_for_retry', - python_callable=assign_new_account_for_retry_callable, - ) - - retry_get_token = PythonOperator( - task_id='retry_get_token', - python_callable=get_token_callable, - templates_dict={'info_json_dir': "{{ dag_run.conf.get('info_json_dir', var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')) }}"}, - ) - - handle_retry_failure_branch = BranchPythonOperator( - task_id='handle_retry_failure_branch', - python_callable=handle_retry_failure_branch_callable, - trigger_rule='one_failed', # This task should only run if retry_get_token fails - ) - - ban_second_account_and_proxy = PythonOperator( - task_id='ban_second_account_and_proxy', - python_callable=ban_second_account_and_proxy_callable, - ) - - # --- Stop Path --- - ban_account_and_fail = PythonOperator( - task_id='ban_account_and_fail', - python_callable=ban_first_account_and_fail_callable, - ) - - # --- Internal Success Merge Point --- - token_acquisition_succeeded = DummyOperator( - task_id='token_acquisition_succeeded', - trigger_rule='one_success', - ) - - # --- Define dependencies within the TaskGroup --- - # The success dummy task is the merge point for the two possible success tasks. - [get_token, retry_get_token] >> token_acquisition_succeeded - - # The first branch operator runs only if get_token fails. - get_token >> handle_bannable_error_branch - # It branches to the retry path or the hard-fail path. - handle_bannable_error_branch >> [ban_account_and_prepare_for_retry, ban_account_and_fail] - - # The retry path - ban_account_and_prepare_for_retry >> assign_new_account_for_retry >> retry_get_token - - # The second branch operator runs only if retry_get_token fails. - retry_get_token >> handle_retry_failure_branch - # It only branches to the final failure task. 
- handle_retry_failure_branch >> ban_second_account_and_proxy - - # --- Main Execution Path (outside the TaskGroup) --- - download_and_probe = BashOperator( - task_id='download_and_probe', - bash_command=""" - set -e - - INFO_JSON_PATH_1="{{ ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='info_json_path') }}" - INFO_JSON_PATH_2="{{ ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='info_json_path') }}" - INFO_JSON_PATH="${INFO_JSON_PATH_1:-$INFO_JSON_PATH_2}" - - PROXY_1="{{ ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='socks_proxy') }}" - PROXY_2="{{ ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='socks_proxy') }}" - PROXY="${PROXY_1:-$PROXY_2}" - - FORMAT="{{ params.download_format }}" - DOWNLOAD_DIR="{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles/video') }}" - FILENAME_TEMPLATE="{{ params.output_path_template }}" - FULL_OUTPUT_PATH="$DOWNLOAD_DIR/$FILENAME_TEMPLATE" - - echo "--- Starting Download Step ---" - echo "Info JSON Path: $INFO_JSON_PATH" - echo "Proxy: $PROXY" - echo "Format: $FORMAT" - echo "Download Directory: $DOWNLOAD_DIR" - echo "Full Output Path: $FULL_OUTPUT_PATH" - - if [ -z "$INFO_JSON_PATH" ] || [ "$INFO_JSON_PATH" == "None" ] || [ ! -f "$INFO_JSON_PATH" ]; then - echo "Error: info.json path is missing or file does not exist ($INFO_JSON_PATH)." - exit 1 - fi - - CMD_ARRAY=(yt-dlp --load-info-json "$INFO_JSON_PATH") - if [ -n "$PROXY" ] && [ "$PROXY" != "None" ]; then - CMD_ARRAY+=(--proxy "$PROXY") - fi - CMD_ARRAY+=(-f "$FORMAT" -o "$FULL_OUTPUT_PATH" --print filename) - CMD_ARRAY+=(--continue --no-progress --no-simulate --no-write-info-json --ignore-errors --no-playlist) - - echo "Executing: $(printf "%q " "${CMD_ARRAY[@]}")" - - FINAL_FILENAME=$("${CMD_ARRAY[@]}") - EXIT_CODE=$? - - echo "yt-dlp exited with code: $EXIT_CODE" - - if [ $EXIT_CODE -ne 0 ]; then - echo "Error: yt-dlp command failed." - exit $EXIT_CODE - fi - if [ -z "$FINAL_FILENAME" ] || [ ! -f "$FINAL_FILENAME" ]; then - echo "Error: Download failed or did not produce a file." - exit 1 - fi - echo "SUCCESS: Download complete. Final file at: $FINAL_FILENAME" - - echo "--- Starting Probe Step ---" - echo "Probing downloaded file: $FINAL_FILENAME" - if ! ffmpeg -v error -i "$FINAL_FILENAME" -f null - ; then - echo "Error: ffmpeg probe check failed for '$FINAL_FILENAME'. The file might be corrupt." - - if [ "{{ params.retry_on_probe_failure }}" == "True" ]; then - echo "Attempting one retry on probe failure..." - echo "Renaming to .part to attempt resuming download." - mv -f "$FINAL_FILENAME" "$FINAL_FILENAME.part" - - # Re-run download command - echo "Re-executing: $(printf "%q " "${CMD_ARRAY[@]}")" - FINAL_FILENAME=$("${CMD_ARRAY[@]}") - EXIT_CODE=$? - echo "yt-dlp retry exited with code: $EXIT_CODE" - - if [ $EXIT_CODE -ne 0 ]; then - echo "Error: yt-dlp retry command failed." - exit $EXIT_CODE - fi - if [ -z "$FINAL_FILENAME" ] || [ ! -f "$FINAL_FILENAME" ]; then - echo "Error: Retry download failed or did not produce a file." - exit 1 - fi - echo "SUCCESS: Retry download complete. Final file at: $FINAL_FILENAME" - - # Re-probe - echo "Probing redownloaded file: $FINAL_FILENAME" - if ! ffmpeg -v error -i "$FINAL_FILENAME" -f null - ; then - echo "Error: ffmpeg probe check failed again for '$FINAL_FILENAME'. Failing with exit code 2." - exit 2 - fi - else - echo "Failing with exit code 2 due to probe failure (retries disabled)." - exit 2 - fi - fi - echo "SUCCESS: Probe confirmed valid media file." 
- - # Push the final filename for the success_task - echo "$FINAL_FILENAME" - """, - retries=0, - retry_delay=timedelta(minutes=1), - ) - - # --- Finalization Tasks --- - mark_url_as_success = PythonOperator( - task_id='mark_url_as_success', - python_callable=mark_url_as_success, - ) - - handle_generic_failure = PythonOperator( - task_id='handle_generic_failure', - python_callable=handle_failure_callable, - trigger_rule='one_failed', # Trigger if any upstream in the failure path fails - ) - - decide_next_step = BranchPythonOperator( - task_id='decide_what_to_do_next', - python_callable=decide_what_to_do_next_callable, - trigger_rule='all_done', - ) - - continue_loop_and_trigger_next_run = PythonOperator( - task_id='continue_loop_and_trigger_next_run', - python_callable=trigger_self_run_callable, - ) - - stop_worker_lane_gracefully = DummyOperator(task_id='stop_worker_lane_gracefully') - mark_dag_run_as_failed = BashOperator(task_id='mark_dag_run_as_failed', bash_command='exit 1') - - # --- Define Task Dependencies --- - pull_url_and_assign_account >> acquire_token_group - - # The TaskGroup's internal success task (`token_acquisition_succeeded`) is the trigger for the download. - # This is more explicit than depending on the entire group's state and prevents the skip issue. - dag.get_task('acquire_token_with_retry.token_acquisition_succeeded') >> download_and_probe - - download_and_probe >> mark_url_as_success - - # Define the failure path. The generic failure handler is set downstream of the two - # main tasks that can fail. Its 'one_failed' trigger rule ensures it only runs on failure. - # This explicit list avoids potential scheduler ambiguity. - [acquire_token_group, download_and_probe] >> handle_generic_failure - - # Define the final decision point. This task must run after the success path completes - # OR after the failure path completes. Its 'all_done' trigger rule makes this possible. - mark_url_as_success >> decide_next_step - handle_generic_failure >> decide_next_step - - decide_next_step >> [continue_loop_and_trigger_next_run, stop_worker_lane_gracefully, mark_dag_run_as_failed] \ No newline at end of file diff --git a/airflow/dags-backup/proxy-checker.py b/airflow/dags-backup/proxy-checker.py deleted file mode 100644 index 0958f2c..0000000 --- a/airflow/dags-backup/proxy-checker.py +++ /dev/null @@ -1,736 +0,0 @@ -import sys -import os -import time -import csv -import json -import logging -import requests -from concurrent.futures import ThreadPoolExecutor, as_completed -from typing import List, Optional, Dict, Callable, Union -from threading import Event - -from PyQt6.QtCore import Qt, QThread, pyqtSignal, QObject, QTimer -from PyQt6.QtWidgets import ( - QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, - QLabel, QLineEdit, QPushButton, QTextEdit, QSpinBox, QDoubleSpinBox, - QCheckBox, QGroupBox, QGridLayout, QMessageBox, QProgressBar, QDialog, - QComboBox, QFileDialog -) - -# Define the current version of this tool. -CURRENT_VERSION = "1.3.0" - -class ProxyChecker: - """ - Fetches proxy lists from given URLs and checks if they work. - Supports cancellation, pause/resume, progress reporting, and collects optional detailed - response times, anonymity classification, and geo-location details for working proxies. 
- """ - def __init__(self, - proxy_urls: Dict[str, str], - timeout: int = 1, - max_retries: int = 3, - retry_delay: float = 1.0, - max_workers: int = 20, - check_url: str = "http://www.google.com", - detailed_results: bool = False, - export_format: str = "txt", # or "csv" or "json" - user_agent: Optional[str] = None, - log_callback: Optional[Callable[[str], None]] = None, - progress_callback: Optional[Callable[[int], None]] = None): - self.proxy_urls = proxy_urls - self.timeout = timeout - self.max_retries = max_retries - self.retry_delay = retry_delay - self.max_workers = max_workers - self.check_url = check_url - self.detailed_results = detailed_results - self.export_format = export_format.lower() - self.user_agent = user_agent - self.log_callback = log_callback - self.progress_callback = progress_callback - self.cancel_event = Event() - self.pause_event = Event() # When set, processing is paused - - # Statistics counters - self.total_proxies_checked = 0 - self.working_proxies_found = 0 - self.overall_total_count = 0 - self.overall_processed_count = 0 - - # Store detailed working results by type. - self.working_results: Dict[str, List[Union[str, Dict[str, Union[str, float, dict]]]]] = {} - - self.session = requests.Session() - if self.user_agent: - self.session.headers["User-Agent"] = self.user_agent - - # Determine the client IP to help with anonymity detection. - try: - r = requests.get("https://api.ipify.org?format=json", timeout=3) - r.raise_for_status() - self.client_ip = r.json().get("ip") - self.log("info", f"Client IP determined as {self.client_ip}") - except requests.RequestException: - self.client_ip = "unknown" - self.log("warning", "Could not determine client IP for anonymity detection.") - - def log(self, level: str, message: str) -> None: - full_message = f"{level.upper()}: {message}" - if self.log_callback: - self.log_callback(full_message) - else: - print(full_message) - - def cancel(self) -> None: - self.cancel_event.set() - self.log("info", "Cancellation requested.") - - def pause(self) -> None: - self.pause_event.set() - self.log("info", "Proxy checking paused.") - - def resume(self) -> None: - self.pause_event.clear() - self.log("info", "Proxy checking resumed.") - - def determine_anonymity(self, proxy: str) -> str: - try: - session = requests.Session() - session.proxies = {'http': proxy, 'https': proxy} - r = session.get("https://api.ipify.org?format=json", timeout=self.timeout) - r.raise_for_status() - proxy_ip = r.json().get("ip") - return "transparent" if proxy_ip == self.client_ip else "anonymous" - except requests.RequestException: - return "unknown" - - def get_geo_info(self, ip: str) -> dict: - try: - r = requests.get(f"http://ip-api.com/json/{ip}", timeout=3) - r.raise_for_status() - return r.json() - except requests.RequestException: - return {} - - def check_proxy(self, proxy: str) -> Optional[Union[str, dict]]: - if self.cancel_event.is_set(): - return None - # If paused, wait until resumed. 
- while self.pause_event.is_set(): - time.sleep(0.1) - try: - start = time.time() - session = requests.Session() - session.proxies = {'http': proxy, 'https': proxy} - if self.user_agent: - session.headers["User-Agent"] = self.user_agent - response = session.get(self.check_url, timeout=self.timeout) - elapsed = time.time() - start - if response.status_code == 200: - if self.detailed_results: - anonymity = self.determine_anonymity(proxy) - ip_only = proxy.split(':')[0] - geo = self.get_geo_info(ip_only) - return { - "proxy": proxy, - "response_time": elapsed, - "anonymity": anonymity, - "geo": geo - } - else: - return proxy - except requests.RequestException: - return None - - def get_proxies(self, url: str) -> List[str]: - for attempt in range(self.max_retries): - if self.cancel_event.is_set(): - self.log("info", "Cancellation detected while fetching proxies.") - return [] - try: - response = self.session.get(url, timeout=self.timeout) - response.raise_for_status() - self.log("info", f"Successfully fetched proxies from {url}") - return response.text.strip().splitlines() - except requests.RequestException as e: - self.log("warning", f"Attempt {attempt + 1} failed for {url}: {e}") - time.sleep(self.retry_delay) - self.log("error", f"Failed to retrieve proxies from {url} after {self.max_retries} attempts.") - return [] - - @staticmethod - def create_proxy_dir(directory: str) -> None: - os.makedirs(directory, exist_ok=True) - - def process_proxies(self, - proxy_type: str, - url: Optional[str] = None, - proxies: Optional[List[str]] = None) -> int: - if proxies is None and url is not None: - proxies = self.get_proxies(url) - if self.cancel_event.is_set(): - self.log("info", "Cancellation detected before processing proxies.") - return 0 - if not proxies: - self.log("warning", f"No proxies to check for {proxy_type}") - return 0 - - total_proxies = len(proxies) - self.log("info", f"Checking {total_proxies} {proxy_type} proxies with {self.max_workers} workers.") - working_proxy_list = [] - with ThreadPoolExecutor(max_workers=self.max_workers) as executor: - futures = {executor.submit(self.check_proxy, proxy): proxy for proxy in proxies} - for future in as_completed(futures): - while self.pause_event.is_set(): - time.sleep(0.1) - if self.cancel_event.is_set(): - self.log("info", "Cancellation detected during proxy checking loop.") - break - result = future.result() - self.overall_processed_count += 1 - if self.progress_callback and self.overall_total_count > 0: - progress_percent = int((self.overall_processed_count / self.overall_total_count) * 100) - self.progress_callback(progress_percent) - if result: - working_proxy_list.append(result) - - self.working_results[proxy_type] = working_proxy_list - file_ext = ".csv" if self.export_format == "csv" else ".json" if self.export_format == "json" else ".txt" - proxy_file = f'proxies/{proxy_type}{file_ext}' - self.create_proxy_dir(os.path.dirname(proxy_file)) - try: - if self.export_format == "csv": - with open(proxy_file, 'w', newline='') as f: - if self.detailed_results: - writer = csv.writer(f) - writer.writerow(["Proxy", "Response Time (s)", "Anonymity", "Country", "Region", "City"]) - for item in working_proxy_list: - geo = item.get("geo", {}) - writer.writerow([ - item.get("proxy"), - f"{item.get('response_time', 0):.2f}", - item.get("anonymity"), - geo.get("country", ""), - geo.get("regionName", ""), - geo.get("city", "") - ]) - else: - writer = csv.writer(f) - writer.writerow(["Proxy"]) - for item in working_proxy_list: - writer.writerow([item]) - 
elif self.export_format == "json": - with open(proxy_file, 'w') as f: - json.dump(working_proxy_list, f, indent=4) - else: - with open(proxy_file, 'w') as f: - if self.detailed_results: - lines = [ - f"{item.get('proxy')} - {item.get('response_time'):.2f} s - {item.get('anonymity')} - {item.get('geo', {}).get('country', '')}" - for item in working_proxy_list - ] - else: - lines = working_proxy_list - f.write('\n'.join(lines) + '\n') - except OSError as e: - self.log("error", f"Failed to write working proxies to {proxy_file}: {e}") - - self.log("info", f"Checked {total_proxies} {proxy_type} proxies. Working: {len(working_proxy_list)}.") - self.total_proxies_checked += total_proxies - self.working_proxies_found += len(working_proxy_list) - return len(working_proxy_list) - - def get_statistics(self) -> str: - stats = f"Total proxies checked: {self.total_proxies_checked}\n" - stats += f"Working proxies found: {self.working_proxies_found}\n" - if self.detailed_results: - all_times = [] - for lst in self.working_results.values(): - all_times.extend([item.get("response_time") for item in lst if isinstance(item, dict)]) - if all_times: - avg_time = sum(all_times) / len(all_times) - stats += f"Average response time: {avg_time:.2f} seconds\n" - return stats - - def run(self) -> None: - start_time = time.time() - self.overall_total_count = 0 - self.overall_processed_count = 0 - proxies_by_type: Dict[str, List[str]] = {} - - for proxy_type, url in self.proxy_urls.items(): - if self.cancel_event.is_set(): - self.log("info", "Cancellation detected. Aborting processing.") - return - proxies = self.get_proxies(url) - proxies_by_type[proxy_type] = proxies - self.overall_total_count += len(proxies) - - if self.overall_total_count == 0: - self.log("warning", "No proxies fetched from any source.") - - for proxy_type, proxies in proxies_by_type.items(): - if self.cancel_event.is_set(): - self.log("info", "Cancellation detected. Aborting further processing.") - break - self.process_proxies(proxy_type, proxies=proxies) - - self.session.close() - end_time = time.time() - minutes, seconds = divmod(end_time - start_time, 60) - self.log("info", f"Total proxies checked: {self.total_proxies_checked}. Working proxies: {self.working_proxies_found}.") - self.log("info", f"Execution time: {int(minutes)} minutes {int(seconds)} seconds.") - self.log("info", "Statistics:\n" + self.get_statistics()) - # Append history log - try: - with open("history.log", "a") as hist_file: - hist_file.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {self.get_statistics()}\n") - except OSError as e: - self.log("error", f"Failed to write history log: {e}") - -class ProxyCheckerWorker(QObject): - """ - Worker class to run the proxy checking process in a separate thread. - Emits log messages, progress updates, and a finished signal. 
- """ - log_signal = pyqtSignal(str) - progress_update = pyqtSignal(int) - finished = pyqtSignal() - - def __init__(self, - proxy_urls: Dict[str, str], - timeout: int, - max_retries: int, - retry_delay: float, - max_workers: int, - check_url: str, - detailed_results: bool, - export_format: str, - user_agent: Optional[str] = None): - super().__init__() - self.proxy_urls = proxy_urls - self.timeout = timeout - self.max_retries = max_retries - self.retry_delay = retry_delay - self.max_workers = max_workers - self.check_url = check_url - self.detailed_results = detailed_results - self.export_format = export_format - self.user_agent = user_agent - self.checker: Optional[ProxyChecker] = None - - def log_callback(self, message: str) -> None: - self.log_signal.emit(message) - - def progress_callback(self, progress: int) -> None: - self.progress_update.emit(progress) - - def cancel(self) -> None: - if self.checker is not None: - self.checker.cancel() - - def run(self) -> None: - self.checker = ProxyChecker( - proxy_urls=self.proxy_urls, - timeout=self.timeout, - max_retries=self.max_retries, - retry_delay=self.retry_delay, - max_workers=self.max_workers, - check_url=self.check_url, - detailed_results=self.detailed_results, - export_format=self.export_format, - user_agent=self.user_agent, - log_callback=self.log_callback, - progress_callback=self.progress_callback - ) - self.log_callback("Starting proxy checking...") - self.checker.run() - self.log_callback("Proxy checking finished.") - self.finished.emit() - -class UpdateChecker(QObject): - """ - Worker class to check for software updates. - """ - update_checked = pyqtSignal(str) - - def run(self) -> None: - try: - response = requests.get("https://api.github.com/repos/Jesewe/proxy-checker/releases/latest", timeout=5) - response.raise_for_status() - data = response.json() - latest_version = data["tag_name"].lstrip("v") - if latest_version != CURRENT_VERSION: - msg = (f"New version available: {latest_version}.\n" - f"You are using version {CURRENT_VERSION}.\n" - f"Visit {data['html_url']} to download the update.") - else: - msg = f"You are up-to-date with version {CURRENT_VERSION}." 
- except Exception as e: - msg = f"Failed to check for updates: {e}" - self.update_checked.emit(msg) - -class MainWindow(QMainWindow): - def __init__(self): - super().__init__() - self.setWindowTitle("Proxy Checker") - self.setGeometry(100, 100, 850, 750) - self.init_ui() - self.thread: Optional[QThread] = None - self.worker: Optional[ProxyCheckerWorker] = None - self.update_thread: Optional[QThread] = None - self.last_checker: Optional[ProxyChecker] = None - self.is_paused = False - - def init_ui(self): - main_widget = QWidget() - main_layout = QVBoxLayout() - - # Configuration group - config_group = QGroupBox("Settings") - config_layout = QGridLayout() - - # Timeout - config_layout.addWidget(QLabel("Timeout (s):"), 0, 0) - self.timeout_spin = QSpinBox() - self.timeout_spin.setRange(1, 60) - self.timeout_spin.setValue(3) - config_layout.addWidget(self.timeout_spin, 0, 1) - - # Max Retries - config_layout.addWidget(QLabel("Max Retries:"), 0, 2) - self.retries_spin = QSpinBox() - self.retries_spin.setRange(1, 10) - self.retries_spin.setValue(3) - config_layout.addWidget(self.retries_spin, 0, 3) - - # Retry Delay - config_layout.addWidget(QLabel("Retry Delay (s):"), 1, 0) - self.retry_delay_spin = QDoubleSpinBox() - self.retry_delay_spin.setRange(0.1, 10.0) - self.retry_delay_spin.setSingleStep(0.1) - self.retry_delay_spin.setValue(1.0) - config_layout.addWidget(self.retry_delay_spin, 1, 1) - - # Max Workers - config_layout.addWidget(QLabel("Max Workers:"), 1, 2) - self.workers_spin = QSpinBox() - self.workers_spin.setRange(1, 200) - self.workers_spin.setValue(50) - config_layout.addWidget(self.workers_spin, 1, 3) - - # Test URL - config_layout.addWidget(QLabel("Test URL:"), 2, 0) - self.test_url_edit = QLineEdit("http://www.google.com") - config_layout.addWidget(self.test_url_edit, 2, 1, 1, 3) - - # Custom User-Agent - config_layout.addWidget(QLabel("Custom User-Agent:"), 3, 0) - self.user_agent_edit = QLineEdit("") - self.user_agent_edit.setPlaceholderText("Leave blank for default") - config_layout.addWidget(self.user_agent_edit, 3, 1, 1, 3) - - # Detailed Results Option - self.detailed_checkbox = QCheckBox("Detailed Results (Include Response Time, Anonymity & Geo)") - config_layout.addWidget(self.detailed_checkbox, 4, 0, 1, 2) - - # Export Format Option - config_layout.addWidget(QLabel("Export Format:"), 4, 2) - self.export_format_combo = QComboBox() - self.export_format_combo.addItems(["txt", "csv", "json"]) - config_layout.addWidget(self.export_format_combo, 4, 3) - - config_group.setLayout(config_layout) - main_layout.addWidget(config_group) - - # Proxy Sources Group - proxy_group = QGroupBox("Proxy Sources") - proxy_layout = QGridLayout() - self.proxy_urls = { - "http": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt", - "socks4": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt", - "socks5": "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt" - } - self.proxy_type_checkboxes = {} - self.proxy_url_edits = {} - row = 0 - for proxy_type, url in self.proxy_urls.items(): - checkbox = QCheckBox(proxy_type) - checkbox.setChecked(True) - self.proxy_type_checkboxes[proxy_type] = checkbox - proxy_layout.addWidget(checkbox, row, 0) - url_edit = QLineEdit(url) - self.proxy_url_edits[proxy_type] = url_edit - proxy_layout.addWidget(url_edit, row, 1) - row += 1 - proxy_group.setLayout(proxy_layout) - main_layout.addWidget(proxy_group) - - # Progress Bar - self.progress_bar = QProgressBar() - self.progress_bar.setRange(0, 100) - 
self.progress_bar.setValue(0) - main_layout.addWidget(self.progress_bar) - - # Main Buttons - btn_layout = QHBoxLayout() - self.start_btn = QPushButton("Start Checking") - self.start_btn.clicked.connect(self.start_checking) - btn_layout.addWidget(self.start_btn) - - self.pause_btn = QPushButton("Pause") - self.pause_btn.setEnabled(False) - self.pause_btn.clicked.connect(self.toggle_pause) - btn_layout.addWidget(self.pause_btn) - - self.cancel_btn = QPushButton("Cancel") - self.cancel_btn.setEnabled(False) - self.cancel_btn.clicked.connect(self.cancel_checking) - btn_layout.addWidget(self.cancel_btn) - - self.show_results_btn = QPushButton("Show Results") - self.show_results_btn.setEnabled(False) - self.show_results_btn.clicked.connect(self.show_results) - btn_layout.addWidget(self.show_results_btn) - main_layout.addLayout(btn_layout) - - # Extra Buttons: Show Statistics, Save Log - extra_btn_layout = QHBoxLayout() - self.show_stats_btn = QPushButton("Show Statistics") - self.show_stats_btn.setEnabled(False) - self.show_stats_btn.clicked.connect(self.show_statistics) - extra_btn_layout.addWidget(self.show_stats_btn) - - self.save_log_btn = QPushButton("Save Log") - self.save_log_btn.clicked.connect(self.save_log) - extra_btn_layout.addWidget(self.save_log_btn) - main_layout.addLayout(extra_btn_layout) - - # Log Text Area - self.log_text = QTextEdit() - self.log_text.setReadOnly(True) - self.log_text.setStyleSheet("background-color: #1e1e1e; color: #d4d4d4; font-family: Consolas; font-size: 12pt;") - main_layout.addWidget(self.log_text) - - main_widget.setLayout(main_layout) - self.setCentralWidget(main_widget) - - def start_checking(self): - self.start_btn.setEnabled(False) - self.cancel_btn.setEnabled(True) - self.pause_btn.setEnabled(True) - self.show_results_btn.setEnabled(False) - self.show_stats_btn.setEnabled(False) - self.progress_bar.setValue(0) - self.log_text.clear() - - # Build proxy_urls from selected checkboxes. - selected_proxy_urls = {} - for proxy_type, checkbox in self.proxy_type_checkboxes.items(): - if checkbox.isChecked(): - url = self.proxy_url_edits[proxy_type].text().strip() - if url: - selected_proxy_urls[proxy_type] = url - - if not selected_proxy_urls: - QMessageBox.warning(self, "No Proxies Selected", "Please select at least one proxy type to check.") - self.start_btn.setEnabled(True) - self.cancel_btn.setEnabled(False) - self.pause_btn.setEnabled(False) - return - - # Get settings from UI. 
- timeout = self.timeout_spin.value() - max_retries = self.retries_spin.value() - retry_delay = self.retry_delay_spin.value() - max_workers = self.workers_spin.value() - check_url = self.test_url_edit.text().strip() - detailed_results = self.detailed_checkbox.isChecked() - export_format = self.export_format_combo.currentText().strip() - user_agent = self.user_agent_edit.text().strip() or None - - self.thread = QThread() - self.worker = ProxyCheckerWorker( - proxy_urls=selected_proxy_urls, - timeout=timeout, - max_retries=max_retries, - retry_delay=retry_delay, - max_workers=max_workers, - check_url=check_url, - detailed_results=detailed_results, - export_format=export_format, - user_agent=user_agent - ) - self.worker.moveToThread(self.thread) - self.worker.log_signal.connect(self.append_log) - self.worker.progress_update.connect(self.progress_bar.setValue) - self.worker.finished.connect(self.on_finished) - self.thread.started.connect(self.worker.run) - self.thread.finished.connect(self.thread.deleteLater) - self.thread.start() - - def toggle_pause(self): - if self.worker and self.worker.checker: - if not self.is_paused: - self.worker.checker.pause() - self.is_paused = True - self.pause_btn.setText("Resume") - self.append_log("Paused proxy checking.") - else: - self.worker.checker.resume() - self.is_paused = False - self.pause_btn.setText("Pause") - self.append_log("Resumed proxy checking.") - - def cancel_checking(self): - if self.worker is not None: - self.append_log("Cancel requested by user...") - self.worker.cancel() - self.cancel_btn.setEnabled(False) - - def append_log(self, message: str): - timestamp = time.strftime("%H:%M:%S") - self.log_text.append(f"[{timestamp}] {message}") - - def on_finished(self): - self.append_log("All tasks completed.") - self.start_btn.setEnabled(True) - self.cancel_btn.setEnabled(False) - self.pause_btn.setEnabled(False) - self.show_results_btn.setEnabled(True) - self.show_stats_btn.setEnabled(True) - if self.thread is not None: - self.thread.quit() - self.thread.wait() - # Save a reference to the last checker for filtering results. - if self.worker: - self.last_checker = self.worker.checker - - def show_results(self): - # If detailed results are enabled, allow filtering by response time. 
- if self.last_checker and self.last_checker.detailed_results: - dialog = QDialog(self) - dialog.setWindowTitle("Filtered Working Proxies") - dialog.resize(600, 500) - layout = QVBoxLayout() - - filter_layout = QHBoxLayout() - filter_layout.addWidget(QLabel("Max Response Time (s):")) - filter_spin = QDoubleSpinBox() - filter_spin.setRange(0.1, 10.0) - filter_spin.setSingleStep(0.1) - filter_spin.setValue(1.0) - filter_layout.addWidget(filter_spin) - apply_btn = QPushButton("Apply Filter") - filter_layout.addWidget(apply_btn) - layout.addLayout(filter_layout) - - result_area = QTextEdit() - result_area.setReadOnly(True) - layout.addWidget(result_area) - - def apply_filter(): - threshold = filter_spin.value() - text = "" - for ptype, results in self.last_checker.working_results.items(): - filtered = [] - for item in results: - if isinstance(item, dict) and item.get("response_time") <= threshold: - geo = item.get("geo", {}) - filtered.append(f"{item.get('proxy')} - {item.get('response_time'):.2f} s - {item.get('anonymity')} - {geo.get('country', '')}") - if filtered: - text += f"--- {ptype} ---\n" + "\n".join(filtered) + "\n\n" - result_area.setText(text if text else "No proxies match the filter criteria.") - - apply_btn.clicked.connect(apply_filter) - # Show all results initially - apply_filter() - - btn_layout = QHBoxLayout() - copy_btn = QPushButton("Copy to Clipboard") - copy_btn.clicked.connect(lambda: QApplication.clipboard().setText(result_area.toPlainText())) - btn_layout.addWidget(copy_btn) - close_btn = QPushButton("Close") - close_btn.clicked.connect(dialog.close) - btn_layout.addWidget(close_btn) - layout.addLayout(btn_layout) - - dialog.setLayout(layout) - dialog.exec() - else: - # Fallback: read the exported files from the proxies directory. - results_text = "" - proxy_dir = "proxies" - if os.path.isdir(proxy_dir): - for filename in os.listdir(proxy_dir): - filepath = os.path.join(proxy_dir, filename) - results_text += f"--- {filename} ---\n" - try: - with open(filepath, 'r') as f: - results_text += f.read() + "\n" - except OSError as e: - results_text += f"Error reading file: {e}\n" - else: - results_text = "No results found." - - dialog = QDialog(self) - dialog.setWindowTitle("Working Proxies") - dialog.resize(600, 400) - dlg_layout = QVBoxLayout() - text_area = QTextEdit() - text_area.setReadOnly(True) - text_area.setText(results_text) - dlg_layout.addWidget(text_area) - - btn_layout = QHBoxLayout() - copy_btn = QPushButton("Copy to Clipboard") - copy_btn.clicked.connect(lambda: QApplication.clipboard().setText(results_text)) - btn_layout.addWidget(copy_btn) - close_btn = QPushButton("Close") - close_btn.clicked.connect(dialog.close) - btn_layout.addWidget(close_btn) - dlg_layout.addLayout(btn_layout) - dialog.setLayout(dlg_layout) - dialog.exec() - - def show_statistics(self): - if self.worker and self.worker.checker: - stats = self.worker.checker.get_statistics() - else: - stats = "No statistics available." 
- QMessageBox.information(self, "Statistics", stats) - - def save_log(self): - filename, _ = QFileDialog.getSaveFileName(self, "Save Log", "", "Text Files (*.txt);;All Files (*)") - if filename: - try: - with open(filename, 'w') as f: - f.write(self.log_text.toPlainText()) - QMessageBox.information(self, "Saved", f"Log saved to {filename}") - except OSError as e: - QMessageBox.warning(self, "Error", f"Failed to save log: {e}") - - def auto_check_for_update(self): - self.update_thread = QThread() - self.update_worker = UpdateChecker() - self.update_worker.moveToThread(self.update_thread) - self.update_worker.update_checked.connect(self.show_update_message) - self.update_thread.started.connect(self.update_worker.run) - self.update_thread.start() - - def show_update_message(self, msg: str): - QMessageBox.information(self, "Update Check", msg) - self.update_thread.quit() - self.update_thread.wait() - - def showEvent(self, event): - super().showEvent(event) - QTimer.singleShot(1000, self.auto_check_for_update) - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') - app = QApplication(sys.argv) - window = MainWindow() - window.show() - sys.exit(app.exec()) \ No newline at end of file diff --git a/airflow/dags-backup/ytdlp_client_dag_v2.1.py b/airflow/dags-backup/ytdlp_client_dag_v2.1.py deleted file mode 100644 index 34dcc5c..0000000 --- a/airflow/dags-backup/ytdlp_client_dag_v2.1.py +++ /dev/null @@ -1,941 +0,0 @@ -from airflow import DAG -from airflow.models import BaseOperator, Variable -from airflow.utils.decorators import apply_defaults -from airflow.hooks.base import BaseHook -from airflow.exceptions import AirflowException -from airflow.utils.dates import days_ago -from thrift.transport import TSocket, TTransport -from thrift.protocol import TBinaryProtocol -from thrift.transport.TTransport import TTransportException -from datetime import datetime, timedelta -from pangramia.yt.exceptions.ttypes import PBServiceException -import redis -import logging -import time -import socket -import json -import os -from pangramia.yt.tokens_ops import YTTokenOpService -from pangramia.yt.common.ttypes import TokenUpdateMode -from airflow.providers.redis.hooks.redis import RedisHook -from airflow.operators.python import PythonOperator -from airflow.models.param import Param -# Assuming ytdlp_utils exists in the same directory or PYTHONPATH -# from ytdlp_utils import get_info_json, is_valid_json, extract_video_id - -# Configure logging -logger = logging.getLogger(__name__) - -# Default settings (similar to ytdlp_client_dag.py) -MAX_RETRIES = 1 -RETRY_DELAY = timedelta(seconds=10) -DEFAULT_TIMEOUT = 30 - -class YtdlpOpsOperator(BaseOperator): - """ - Custom Airflow operator to interact with YTDLP Thrift service. Handles direct connections - and Redis-based discovery, retrieves tokens, saves info.json, and manages errors. 
- """ - template_fields = ('url', 'service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir') - - @apply_defaults - def __init__(self, url, redis_conn_id='redis_default', max_retries=3, retry_delay=10, - service_ip=None, service_port=None, redis_enabled=False, account_id=None, - save_info_json=True, info_json_dir=None, get_socks_proxy=True, - store_socks_proxy=False, timeout=DEFAULT_TIMEOUT, *args, **kwargs): - super().__init__(*args, **kwargs) - - logger.info(f"Initializing YtdlpOpsOperator with parameters: url={url}, " - f"redis_conn_id={redis_conn_id}, max_retries={max_retries}, retry_delay={retry_delay}, " - f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, " - f"account_id={account_id}, save_info_json={save_info_json}, info_json_dir={info_json_dir}, " - f"get_socks_proxy={get_socks_proxy}, store_socks_proxy={store_socks_proxy}, timeout={timeout}") - - # Validate required parameters - if not url: - raise ValueError("url is required") - - # Validate parameters based on connection mode - if redis_enabled: - if not account_id: - raise ValueError("account_id is required when redis_enabled=True") - # Use default Redis connection if not specified - if not redis_conn_id: - redis_conn_id = 'redis_default' - logger.info(f"Using default Redis connection ID: {redis_conn_id}") - else: - if not service_ip or not service_port: - raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False") - if not account_id: - logger.warning("No account_id provided for direct connection mode. Using 'default'") - account_id = 'default' # Assign default if missing in direct mode - - self.url = url - self.redis_conn_id = redis_conn_id - self.max_retries = max_retries - self.retry_delay = int(retry_delay.total_seconds() if isinstance(retry_delay, timedelta) else retry_delay) - self.service_ip = service_ip - self.service_port = service_port - self.redis_enabled = redis_enabled - self.account_id = account_id - self.save_info_json = save_info_json - self.info_json_dir = info_json_dir - self.get_socks_proxy = get_socks_proxy - self.store_socks_proxy = store_socks_proxy - self.timeout = timeout - - def execute(self, context): - logger.info("Executing YtdlpOpsOperator") - transport = None - try: - logger.info("Getting task parameters") - params = context.get('params', {}) - redis_enabled = params.get('redis_enabled', self.redis_enabled) - logger.info(f"Using redis_enabled={redis_enabled} (from {'task params' if 'redis_enabled' in params else 'operator init'})") - - # Determine account_id to use (from params or operator default) - account_id = context['params'].get('account_id', self.account_id) - logger.info(f"Using account_id='{account_id}' (from {'task params' if 'account_id' in params else 'operator init'})") - - if redis_enabled: - # Get Redis connection with proper authentication and error handling - redis_conn = BaseHook.get_connection(self.redis_conn_id) - redis_client = redis.Redis( - host=redis_conn.host, - port=redis_conn.port, - password=redis_conn.password, - db=0, - decode_responses=True # Important for consistent key handling - ) - - # Test Redis connection - try: - if not redis_client.ping(): - raise redis.exceptions.ConnectionError("Redis ping failed") - logger.info(f"Successfully connected to Redis at {redis_conn.host}:{redis_conn.port}") - except redis.exceptions.AuthenticationError: - logger.error(f"Redis authentication failed for connection '{self.redis_conn_id}'. 
Check password.") - raise AirflowException("Redis authentication failed.") - except redis.exceptions.ConnectionError as e: - logger.error(f"Could not connect to Redis at {redis_conn.host}:{redis_conn.port}. Error: {e}") - raise AirflowException(f"Redis connection failed: {e}") - except Exception as e: - logger.error(f"Unexpected Redis error: {str(e)}") - raise AirflowException(f"Unexpected Redis error: {e}") - - # Get service details from Redis with retries and proper key handling - service_key = f"ytdlp:{account_id}" - legacy_key = account_id # For backward compatibility - - host = None - port = None - for attempt in range(self.max_retries): - try: - logger.info(f"Attempt {attempt + 1}/{self.max_retries}: Fetching service details from Redis for keys: '{service_key}', '{legacy_key}'") - service_details = redis_client.hgetall(service_key) - if not service_details: - logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'") - service_details = redis_client.hgetall(legacy_key) - - if not service_details: - raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}") - - # Find IP and port, handling potential case differences and byte/string types - ip_key = next((k for k in service_details if k.lower() == 'ip'), None) - port_key = next((k for k in service_details if k.lower() == 'port'), None) - - if not ip_key: raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}") - if not port_key: raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}") - - host = service_details[ip_key] # Already decoded due to decode_responses=True - port_str = service_details[port_key] - - try: - port = int(port_str) - except ValueError: - raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}") - - logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}") - break # Success - - except Exception as e: - logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {str(e)}") - if attempt == self.max_retries - 1: - logger.error("Max retries reached for fetching Redis details.") - raise AirflowException(f"Failed to get service details from Redis after {self.max_retries} attempts: {e}") - logger.info(f"Retrying in {self.retry_delay} seconds...") - time.sleep(self.retry_delay) - else: - # Direct connection: Log parameter sources - params = context.get('params', {}) - host = params.get('service_ip', self.service_ip) - host_source = 'task params' if 'service_ip' in params else 'operator init' - port_str = params.get('service_port', self.service_port) - port_source = 'task params' if 'service_port' in params else 'operator init' - url = params.get('url', self.url) - url_source = 'task params' if 'url' in params else 'operator init' - - logger.info(f"Using service_ip={host} (from {host_source})") - logger.info(f"Using service_port={port_str} (from {port_source})") - logger.info(f"Using url={url} (from {url_source})") - - if not host or not port_str: - raise ValueError("Direct connection requires service_ip and service_port") - try: - port = int(port_str) - except ValueError: - raise ValueError(f"Invalid service_port value: {port_str}") - - logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)") - - # Render and validate timeout - timeout_param = context.get('params', {}).get('timeout', self.timeout) - if isinstance(self.timeout, str) and '{{' in self.timeout: - timeout_rendered = 
self.render_template(self.timeout, context) - logger.info(f"Rendered timeout template: '{self.timeout}' -> '{timeout_rendered}'") - timeout_param = timeout_rendered - try: - timeout = int(timeout_param) - if timeout <= 0: raise ValueError("Timeout must be positive") - logger.info(f"Using timeout: {timeout} seconds") - except (ValueError, TypeError): - logger.warning(f"Invalid timeout value: '{timeout_param}'. Using default: {DEFAULT_TIMEOUT}") - timeout = DEFAULT_TIMEOUT - - # Create Thrift connection objects - socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET) # Explicitly use AF_INET (IPv4) - socket_conn.setTimeout(timeout * 1000) # Thrift timeout is in milliseconds - transport = TTransport.TFramedTransport(socket_conn) - protocol = TBinaryProtocol.TBinaryProtocol(transport) - client = YTTokenOpService.Client(protocol) - - logger.info(f"Attempting to connect to Thrift server at {host}:{port}...") - try: - transport.open() - logger.info("Successfully connected to Thrift server.") - - # Test connection with ping - try: - client.ping() - logger.info("Server ping successful.") - except Exception as e: - logger.error(f"Server ping failed: {e}") - raise AirflowException(f"Server connection test (ping) failed: {e}") - - # Get token from service with specific error handling - try: - url_param = context.get('params', {}).get('url', self.url) - logger.info(f"Requesting token for accountId='{account_id}', url='{url_param}'") - token_data = client.getOrRefreshToken( - accountId=account_id, - updateType=TokenUpdateMode.AUTO, - url=url_param - ) - logger.info("Successfully retrieved token data from service.") - except PBServiceException as e: - logger.error(f"PBServiceException occurred: Code={getattr(e, 'errorCode', 'N/A')}, Message={getattr(e, 'message', 'N/A')}") - error_code = getattr(e, 'errorCode', None) - error_msg = f"YTDLP service error: {getattr(e, 'message', str(e))}" - # Handle specific known error codes - if error_code in [ - "SOCKS5_CONNECTION_FAILED", "SOCKS5_TIMEOUT", - "SOCKS5_CONNECTION_REFUSED", "SOCKS5_CONNECTION_TIMEOUT", - "SOCKS5_HOST_NOT_FOUND", "SOCKS5_NETWORK_UNREACHABLE" - ]: - error_msg = f"SOCKS5 proxy error ({error_code}): {e.message}. Check proxy settings." - elif error_code == "BOT_DETECTION": - error_msg = f"Bot detection triggered ({error_code}): {e.message}." - suggestions = getattr(e, 'context', {}).get('suggestions', []) - if suggestions: error_msg += "\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions) - elif error_code == "NODEJS_SCRIPT_ERROR": - error_msg = f"Node.js script error ({error_code}): {e.message}." - elif error_code == "NODEJS_TIMEOUT": - error_msg = f"Node.js timeout ({error_code}): {e.message}." - # Add more specific error handling as needed - raise AirflowException(error_msg) - except TTransportException as e: - logger.error(f"Thrift transport error during getOrRefreshToken: {e}") - raise AirflowException(f"Transport error during API call: {e}") - except Exception as e: - logger.error(f"Unexpected error during getOrRefreshToken: {e}") - raise AirflowException(f"Unexpected error during API call: {e}") - - except TTransportException as e: - # Handle connection-specific transport errors - if "read 0 bytes" in str(e) or "Could not connect to" in str(e) or "Connection refused" in str(e): - logger.error(f"Connection failed to {host}:{port}. 
Details: {e}") - logger.error("Possible causes: Server down, firewall block, incorrect IP/port.") - raise AirflowException(f"Failed to connect to YTDLP service at {host}:{port}: {e}") - else: - logger.error(f"Thrift transport error during connection: {str(e)}") - raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}") - except Exception as e: - logger.error(f"Unexpected error during connection or ping: {str(e)}") - raise # Re-raise other unexpected errors - - # Log received token data attributes for debugging - logger.debug(f"Token data received. Attributes: {dir(token_data)}") - for attr in dir(token_data): - if not attr.startswith('__') and not callable(getattr(token_data, attr)): # Log non-callable attributes - value = getattr(token_data, attr) - if attr == 'infoJson' and value: - logger.debug(f"infoJson: {value[:50]}...") - else: - logger.debug(f"{attr}: {value}") - - info_json_path = None # Initialize info_json_path - - save_info_json_param = context['params'].get('save_info_json', self.save_info_json) - # Render if it's a string template - if isinstance(save_info_json_param, str): - save_info_json_rendered = self.render_template(save_info_json_param, context) - # Convert common string representations to boolean - save_info_json = str(save_info_json_rendered).lower() in ['true', '1', 't', 'y', 'yes'] - else: - save_info_json = bool(save_info_json_param) - - - # Save info.json if requested and valid - if self.save_info_json: - info_json = self._get_info_json(token_data) - if info_json and self._is_valid_json(info_json): - try: - # Use internal _save_info_json method which handles rendering, dir creation, logging - info_json_path = self._save_info_json(context, info_json) - if info_json_path: # Check if saving was successful - context['task_instance'].xcom_push(key='info_json_path', value=info_json_path) - logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}") - else: - # _save_info_json should log errors, push None to indicate failure - context['task_instance'].xcom_push(key='info_json_path', value=None) - logger.warning("info.json saving failed (check logs from _save_info_json), pushing None to XCom for info_json_path.") - except Exception as e: - logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True) - context['task_instance'].xcom_push(key='info_json_path', value=None) # Push None on error - elif info_json: - logger.warning("Retrieved infoJson is not valid JSON. Skipping save.") - context['task_instance'].xcom_push(key='info_json_path', value=None) - else: - logger.info("No infoJson found in token data. Skipping save.") - context['task_instance'].xcom_push(key='info_json_path', value=None) - else: - logger.info("save_info_json is False. 
Skipping info.json save.") - context['task_instance'].xcom_push(key='info_json_path', value=None) - - - # Extract and potentially store SOCKS proxy - socks_proxy = None - if self.get_socks_proxy: # Use instance attribute - # Check for common attribute names for proxy - proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None) - if proxy_attr: - socks_proxy = getattr(token_data, proxy_attr) - if socks_proxy: # Ensure proxy value is not empty - logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}") - if self.store_socks_proxy: # Use instance attribute - context['task_instance'].xcom_push(key='socks_proxy', value=socks_proxy) - logger.info(f"Pushed key 'socks_proxy' to XCom with value: {socks_proxy}") - else: - logger.info("SOCKS proxy extracted but not pushed to XCom (store_socks_proxy=False).") - else: - logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty. No proxy extracted.") - # Push None even if found but empty, if storing is enabled - if self.store_socks_proxy: # Use instance attribute - context['task_instance'].xcom_push(key='socks_proxy', value=None) - logger.info("Pushed None to XCom for 'socks_proxy' as extracted value was empty.") - else: - logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found in token data.") - # Push None if storing is enabled but attribute not found - if self.store_socks_proxy: # Use instance attribute - context['task_instance'].xcom_push(key='socks_proxy', value=None) - logger.info("Pushed None to XCom for 'socks_proxy' as attribute was not found.") - else: - logger.info("get_socks_proxy is False. Skipping proxy extraction.") - # Push None if storing is enabled but extraction was skipped - if self.store_socks_proxy: # Use instance attribute - context['task_instance'].xcom_push(key='socks_proxy', value=None) - logger.info("Pushed None to XCom for 'socks_proxy' as get_socks_proxy=False.") - - - # Get the original command from the server - ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None) - if not ytdlp_cmd: - logger.error("No 'ytdlpCommand' attribute found in token data.") - raise AirflowException("Required 'ytdlpCommand' not received from service.") - - logger.info(f"Original command received from server: {ytdlp_cmd}") - - # Log example usage command (DO NOT MODIFY the original command here) - if info_json_path: - # Use double quotes for paths/proxy in example for robustness - example_cmd = f"yt-dlp --load-info-json \"{info_json_path}\"" - if socks_proxy: - example_cmd += f" --proxy \"{socks_proxy}\"" - example_cmd += " --verbose --simulate" # Add useful flags for testing - logger.info(f"\n--- Example usage with saved info.json ---") - logger.info(example_cmd) - logger.info(f"(Note: The actual command with tokens/cookies is pushed to XCom as 'ytdlp_command')") - latest_json_path = os.path.join(os.path.dirname(info_json_path), 'latest.json') - logger.info(f"(You can also use 'latest.json': {latest_json_path})") - logger.info(f"-------------------------------------------\n") - - else: - logger.info("\n--- Original command pushed to XCom ('ytdlp_command') ---") - if socks_proxy: - logger.info(f"Use the extracted proxy '{socks_proxy}' (pushed to XCom if store_socks_proxy=True) with the --proxy flag.") - logger.info("Add --verbose and --simulate flags for testing the command.") - logger.info(f"-------------------------------------------------------\n") - - - # Push the *original* command to XCom - context['task_instance'].xcom_push(key='ytdlp_command', 
value=ytdlp_cmd) - logger.info(f"Pushed original command to XCom key 'ytdlp_command'.") - - # Note: Returning ytdlp_cmd below implicitly pushes the same value - # to XCom under the key 'return_value'. Downstream tasks should - # preferably use the explicitly pushed 'ytdlp_command' key for clarity. - return ytdlp_cmd # Return the original command - - except AirflowException as e: # Catch AirflowExceptions raised explicitly in the code above - logger.error(f"Operation failed due to AirflowException: {e}") - raise # Re-raise AirflowExceptions to ensure task failure - except (TTransportException, PBServiceException) as e: # Catch specific Thrift/Service errors not already wrapped - logger.error(f"Unhandled Thrift/Service error: {e}", exc_info=True) # Add traceback for context - raise AirflowException(f"Unhandled YTDLP service error: {e}") # Wrap in AirflowException - except Exception as e: # General catch-all for truly unexpected errors - # Log with traceback for unexpected errors - logger.error(f"Caught unexpected error in YtdlpOpsOperator: {e}", exc_info=True) - # Ensure any unexpected error explicitly fails the task with AirflowException - raise AirflowException(f"Unexpected error caused task failure: {e}") - finally: - if transport and transport.isOpen(): # Check if transport exists and is open before closing - logger.info("Closing Thrift transport.") - transport.close() - - # --- Helper Methods --- - - def _get_info_json(self, token_data): - """Safely extracts infoJson from token data.""" - info_json = getattr(token_data, 'infoJson', None) - if info_json: - logger.debug("Extracted infoJson from token data.") - else: - logger.debug("No infoJson attribute found in token data.") - return info_json - - def _is_valid_json(self, json_str): - """Checks if a string is valid JSON.""" - if not json_str or not isinstance(json_str, str): - logger.debug("Input is not a non-empty string, considered invalid JSON.") - return False - try: - json.loads(json_str) - logger.debug("JSON string validation successful.") - return True - except json.JSONDecodeError as e: - logger.warning(f"JSON validation failed: {e}") - return False - - def _save_info_json(self, context, info_json): - """Saves info_json to a file, handling directory creation and logging. Returns the path on success, None on failure.""" - try: - # Get URL from params/context for video ID extraction - url_param = context.get('params', {}).get('url', self.url) - video_id = self._extract_video_id(url_param) # Use internal helper - - # Render the info_json_dir template - save_dir_template = self.info_json_dir or "." # Default to current dir if template is None or empty string - save_dir = self.render_template(save_dir_template, context) - if not save_dir: # Handle case where template renders to empty string - logger.warning(f"Rendered info_json_dir template '{save_dir_template}' resulted in an empty path. Defaulting to '.'") - save_dir = "." - logger.info(f"Target directory for info.json (rendered): {save_dir}") - - # Ensure directory exists - try: - os.makedirs(save_dir, exist_ok=True) - logger.info(f"Ensured directory exists: {save_dir}") - except OSError as e: - logger.error(f"Could not create directory {save_dir}: {e}. 
Cannot save info.json.") - return None # Indicate failure - - # Construct filename (using potentially overridden account_id) - account_id_param = context.get('params', {}).get('account_id', self.account_id) - timestamp = int(time.time()) - base_filename = f"info_{video_id}_{account_id_param}_{timestamp}.json" if video_id else f"info_{account_id_param}_{timestamp}.json" - info_json_path = os.path.join(save_dir, base_filename) - latest_json_path = os.path.join(save_dir, "latest.json") # Path for the latest symlink/copy - - # Write to timestamped file - try: - logger.info(f"Writing info.json content (received from service) to {info_json_path}...") - with open(info_json_path, 'w', encoding='utf-8') as f: - f.write(info_json) - logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}") - except IOError as e: - logger.error(f"Failed to write info.json to {info_json_path}: {e}") - return None # Indicate failure - - # Write to latest.json (overwrite) - best effort - try: - with open(latest_json_path, 'w', encoding='utf-8') as f: - f.write(info_json) - logger.info(f"Updated latest.json file: {latest_json_path}") - except IOError as e: - # Log warning but don't fail the whole save if only latest.json fails - logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}") - - return info_json_path # Return path on success (even if latest.json failed) - - except Exception as e: - logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True) - return None # Indicate failure - - def _extract_video_id(self, url): - """Extracts YouTube video ID from URL (internal helper).""" - if not url or not isinstance(url, str): - logger.debug("URL is empty or not a string, cannot extract video ID.") - return None - try: - # Basic extraction logic (can be enhanced for more URL types) - video_id = None - if 'youtube.com/watch?v=' in url: - video_id = url.split('v=')[1].split('&')[0] - elif 'youtu.be/' in url: - video_id = url.split('youtu.be/')[1].split('?')[0] - - # Ensure it looks like a video ID (typically 11 chars, but can vary) - if video_id and len(video_id) >= 11: - video_id = video_id[:11] # Take first 11 chars as standard ID length - logger.debug(f"Extracted video ID '{video_id}' from URL: {url}") - return video_id - else: - logger.debug(f"Could not extract a standard video ID pattern from URL: {url}") - return None - except Exception as e: - logger.error(f"Failed to extract video ID from URL '{url}'. 
Error: {e}") - return None - - -# ============================================================================= -# Python Callables for Tasks -# ============================================================================= - -def display_token_info(**context): - """Displays token info from XCom, parses info.json, and logs example commands.""" - ti = context['task_instance'] - logger.info("Starting display_token_info task.") - - # Pull data from XCom (provide default values) - info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path') - socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy') - ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command') - - logger.info("\n=== Pulled Token Information from XCom ===") - logger.info(f"Info.json path: {info_json_path or 'Not found/Not saved'}") - logger.info(f"SOCKS Proxy: {socks_proxy or 'Not found/Not extracted'}") - logger.info(f"Original yt-dlp command (with tokens): {ytdlp_command or 'Not found'}") - - result = { - 'info_path': info_json_path, - 'proxy': socks_proxy, - 'ytdlp_command': ytdlp_command, - 'video_info': None, - 'commands': {}, - 'error': None - } - - if info_json_path and os.path.exists(info_json_path): - logger.info(f"\n=== Processing Video Information from: {info_json_path} ===") - try: - with open(info_json_path, 'r', encoding='utf-8') as f: - info = json.load(f) - - # Extract and log basic video info safely - title = info.get('title', 'Unknown Title') - uploader = info.get('uploader', 'Unknown Author') - duration = info.get('duration_string', 'Unknown Length') - upload_date_str = info.get('upload_date') # Format: YYYYMMDD - upload_date_formatted = 'Unknown Date' - if upload_date_str: - try: - # Validate format before parsing - if len(upload_date_str) == 8 and upload_date_str.isdigit(): - upload_date_formatted = datetime.strptime(upload_date_str, '%Y%m%d').strftime('%Y-%m-%d') - else: - logger.warning(f"Upload date '{upload_date_str}' is not in YYYYMMDD format.") - except ValueError: - logger.warning(f"Could not parse upload_date '{upload_date_str}'") - - result['video_info'] = { - 'title': title, - 'uploader': uploader, - 'upload_date': upload_date_formatted, # Store formatted date - 'duration': duration - } - - logger.info(f"Title: {title}") - logger.info(f"Author: {uploader}") - logger.info(f"Date: {upload_date_formatted}") - logger.info(f"Length: {duration}") - - logger.info("\n=== Example yt-dlp Commands (using saved info.json) ===") - base_cmd = f"yt-dlp --load-info-json \"{info_json_path}\"" - if socks_proxy: - base_cmd += f" --proxy \"{socks_proxy}\"" - - # Command to list formats - format_cmd = f"{base_cmd} -F" - result['commands']['format'] = format_cmd - logger.info(f"List formats command: {format_cmd}") - - # Execute and log the format listing command - logger.info("\n--- Executing Format List Command ---") - try: - # Use os.popen for simplicity, capture output - logger.info(f"Running: {format_cmd}") - format_output = os.popen(format_cmd).read() - logger.info("--- Format List Output ---") - logger.info(format_output) - logger.info("--------------------------") - except Exception as e: - logger.error(f"Error executing format command: {e}") - - # Command to simulate download - simulate_cmd = f"{base_cmd} --simulate --verbose" # Add verbose for more info - result['commands']['simulate'] = simulate_cmd - logger.info(f"Simulate download command: {simulate_cmd}") - - # Execute and log the simulation command - logger.info("\n--- Executing Simulation Command ---") - try: - 
logger.info(f"Running: {simulate_cmd}") - simulate_output = os.popen(simulate_cmd).read() - logger.info("--- Simulation Output ---") - logger.info(simulate_output) - logger.info("-------------------------") - except Exception as e: - logger.error(f"Error executing simulation command: {e}") - - # Basic download command - download_cmd = base_cmd - result['commands']['download_base'] = download_cmd - logger.info(f"Base download command (add format selection, output path): {download_cmd}") - - # Push generated example commands to XCom for potential downstream use - # ti.xcom_push(key='format_cmd', value=format_cmd) # Removed as requested - # ti.xcom_push(key='simulate_cmd', value=simulate_cmd) # Removed as requested - ti.xcom_push(key='download_cmd', value=download_cmd) - logger.info(f"Pushed key 'download_cmd' to XCom with value: {download_cmd}") - - except json.JSONDecodeError as e: - error_msg = f"Failed to parse info.json file '{info_json_path}': {e}" - logger.error(error_msg) - result['error'] = error_msg - except FileNotFoundError: - error_msg = f"Info.json file not found at path: {info_json_path}" - logger.error(error_msg) - result['error'] = error_msg - except Exception as e: - error_msg = f"Error processing info.json file '{info_json_path}': {str(e)}" - logger.error(error_msg, exc_info=True) - result['error'] = error_msg - elif info_json_path: - error_msg = f"Info.json path provided ('{info_json_path}') but file does not exist." - logger.warning(error_msg) - result['error'] = error_msg - else: - logger.warning("No info.json path found in XCom. Cannot display video details or generate example commands.") - result['error'] = "Info.json path not available." - - logger.info("Finished display_token_info task.") - # Return the collected information (useful if used as a PythonOperator return value) - return json.dumps(result) # Return as JSON string for XCom compatibility if needed - - -def store_token_info(**context): - """Stores retrieved token information (command, proxy, info.json) in Redis.""" - ti = context['task_instance'] - # Use the redis_conn_id defined in the operator/DAG params if possible, else default - redis_conn_id = context['params'].get('redis_conn_id', 'redis_default') - redis_hook = RedisHook(redis_conn_id=redis_conn_id) - logger.info(f"Starting store_token_info task using Redis connection '{redis_conn_id}'.") - - try: - # Pull necessary data from XCom and context - url = context['params'].get('url') - if not url: - # Attempt to get URL from DAG run conf as fallback - url = context.get('dag_run', {}).conf.get('url') - if not url: - raise ValueError("URL parameter is missing in context['params'] and dag_run.conf") - logger.warning("URL parameter missing in context['params'], using URL from dag_run.conf.") - - - ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command') - socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy') or '' # Default to empty string if None - info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path') - - if not ytdlp_command: - logger.warning("ytdlp_command not found in XCom. 
Storing empty value.") - ytdlp_command = '' # Store empty if not found - - # Construct the base command using info.json - ytdlp_command_base = '' - if info_json_path and os.path.exists(info_json_path): - ytdlp_command_base = f"yt-dlp --load-info-json \"{info_json_path}\"" - logger.info(f"Constructed base command: {ytdlp_command_base}") - else: - logger.warning("Cannot construct base command: info_json_path not valid.") - - # Construct the command with tokens and proxy - ytdlp_command_tokens = ytdlp_command # Start with original command from server - if socks_proxy: - ytdlp_command_tokens += f" --proxy \"{socks_proxy}\"" - logger.info("Appended proxy to token command.") - - data_to_store = { - 'url': url, - 'ytdlp_command': ytdlp_command_base, # Store the base command - 'proxy': socks_proxy, - 'info_json_path': info_json_path or '' # Store path even if None/empty - # 'info_json' will be added below - } - - # Read info.json content if path exists - info_json_content = None - if info_json_path and os.path.exists(info_json_path): - try: - with open(info_json_path, 'r', encoding='utf-8') as f: - # Read and immediately validate JSON structure before storing - info_json_content = json.load(f) - # Store the validated JSON as a string - data_to_store['info_json'] = json.dumps(info_json_content) - logger.info(f"Read and validated info.json content from: {info_json_path}") - except json.JSONDecodeError as e: - logger.error(f"Failed to parse info.json file '{info_json_path}' as JSON: {e}. Storing empty content.") - data_to_store['info_json'] = '' # Store empty string on parse error - except Exception as e: - logger.error(f"Failed to read info.json file '{info_json_path}': {e}. Storing empty content.") - data_to_store['info_json'] = '' # Store empty string on other read errors - else: - logger.warning(f"info_json_path ('{info_json_path}') not found or invalid. Storing without info_json content.") - data_to_store['info_json'] = '' # Store empty string if no path - - # Determine Redis key using video ID - # Use the same helper method as the operator for consistency - # Need an instance or static method call. Let's make _extract_video_id static temporarily - # Or instantiate the operator just for this - less ideal. - # Simplest: Re-implement or assume utils. 
- # Re-implementing basic logic here for simplicity: - video_id = None - try: - if 'youtube.com/watch?v=' in url: - video_id = url.split('v=')[1].split('&')[0][:11] - elif 'youtu.be/' in url: - video_id = url.split('youtu.be/')[1].split('?')[0][:11] - except Exception: - pass # Ignore errors in ID extraction for key generation - redis_key = f"token_info:{video_id or 'unknown'}" - logger.info(f"Determined Redis key: {redis_key}") - - # Store data in Redis hash - # Log presence/absence rather than full content for potentially large fields - logger.info(f"Data to store in Redis key '{redis_key}': " - f"URL='{data_to_store['url']}', " - f"Command={'' if data_to_store['ytdlp_command'] else ''}, " - f"Proxy='{data_to_store['proxy'] or ''}', " - f"Path='{data_to_store['info_json_path'] or ''}', " - f"JSON Content={'' if data_to_store.get('info_json') else ''}") - - with redis_hook.get_conn() as redis_client: - # Extract video ID from URL - video_id = None - try: - if 'youtube.com/watch?v=' in url: - video_id = url.split('v=')[1].split('&')[0][:11] - elif 'youtu.be/' in url: - video_id = url.split('youtu.be/')[1].split('?')[0][:11] - except Exception: - pass # Ignore errors in ID extraction for key generation - - # Use video ID as part of the Redis key - redis_key = f"token_info:{video_id or 'unknown'}" - logger.info(f"Determined Redis key: {redis_key}") - - # Store data in Redis hash - # Add video_id, timestamp, and the constructed ytdlp_command_tokens - data_to_store['video_id'] = video_id or 'unknown' - data_to_store['timestamp'] = int(time.time()) - data_to_store['ytdlp_command_tokens'] = ytdlp_command_tokens # Store the original token command - - # Log fields being stored - log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()} - logger.info(f"Storing in Redis key '{redis_key}': {log_data}") - - redis_client.hset(redis_key, mapping=data_to_store) - # Set expiration (e.g., 24 hours = 86400 seconds) - redis_client.expire(redis_key, 86400) - logger.info(f"Successfully stored token info in Redis key '{redis_key}' with 24h expiration.") - # Log the final stored data again for clarity - final_log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()} - logger.info(f"--- Final Data Stored in Redis Key '{redis_key}' ---") - logger.info(final_log_data) - logger.info("----------------------------------------------------") - - - except Exception as e: - logger.error(f"Failed to store token info in Redis: {e}", exc_info=True) - # Re-raise as AirflowException to fail the task - raise AirflowException(f"Failed to store token info in Redis: {e}") - - logger.info("Finished store_token_info task.") - - -# ============================================================================= -# DAG Definition -# ============================================================================= - -# Update default_args to match ytdlp_client_dag.py structure -default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'email_on_failure': False, # Match reference DAG - 'email_on_retry': False, # Match reference DAG - 'retries': 1, # Default task retries - 'retry_delay': timedelta(minutes=5), # Standard task retry delay - 'start_date': days_ago(1) # Best practice start date -} - -# Update DAG definition -with DAG( - dag_id='ytdlp_client_dag_v2.1', - default_args=default_args, - schedule_interval=None, # Manually triggered DAG - catchup=False, # Don't run for past missed schedules - description='DAG 
for YTDLP operations using Thrift client (V2 - Refactored)', # Updated description - tags=['ytdlp', 'thrift', 'client', 'v2'], # Updated tags for better filtering - params={ - # Define DAG parameters with defaults and types for UI clarity - 'url': Param('https://www.youtube.com/watch?v=sOlTX9uxUtM', type=["null", "string"], description="Required: The video URL to process."), # Default URL - 'redis_enabled': Param(False, type="boolean", description="Use Redis for service discovery? If False, uses service_ip/port."), # Default to direct connection - 'service_ip': Param('85.192.30.55', type="string", description="Service IP if redis_enabled=False."), # Default service IP - 'service_port': Param(9090, type="integer", description="Service port if redis_enabled=False."), # Default service port - 'account_id': Param('account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09', type="string", description="Account ID for Redis lookup or direct call."), # Updated default account_id - 'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."), - # Use Airflow Variable for downloads directory, matching reference DAG structure - 'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json. Uses Airflow Variable 'DOWNLOADS_TEMP' or default.") - } -) as dag: - - # Define Tasks - - get_token = YtdlpOpsOperator( - task_id='get_token', - # Pass templated parameters from DAG run config - url="{{ params.url }}", - redis_enabled="{{ params.redis_enabled }}", - service_ip="{{ params.service_ip }}", - service_port="{{ params.service_port }}", - account_id="{{ params.account_id }}", - save_info_json=True, - info_json_dir="{{ params.info_json_dir }}", - get_socks_proxy=True, - store_socks_proxy=True, - timeout="{{ params.timeout }}", - retries=MAX_RETRIES, # Operator-specific retries if needed, else use DAG default - retry_delay=RETRY_DELAY, # Operator-specific delay if needed - # Add callbacks for logging success/failure, similar to reference DAG - on_failure_callback=lambda context: logger.error(f"Task {context['task_instance_key_str']} failed."), - on_success_callback=lambda context: logger.info(f"Task {context['task_instance_key_str']} succeeded.") - ) - # Add task documentation (visible in Airflow UI) - get_token.doc_md = """ - ### Get Token Task - Connects to the YTDLP Thrift service (either directly or via Redis discovery) - to retrieve an authentication token and video metadata (info.json). - - **Pushes to XCom:** - - `info_json_path`: Path to the saved info.json file (or None if not saved/failed). - - `socks_proxy`: The extracted SOCKS proxy string (or None if not requested/found). - - `ytdlp_command`: The original command string received from the server (contains tokens/cookies). - - - Uses parameters defined in the DAG run configuration. - """ - - # Optional: Add a task to explicitly check XComs for debugging (like in reference DAG) - def _check_xcom_callable(**context): - """Logs XCom values pushed by the get_token task.""" - ti = context['task_instance'] - logger.info("--- Checking XCom values pushed by get_token ---") - keys_to_check = ['info_json_path', 'socks_proxy', 'ytdlp_command'] - xcom_values = {} - for key in keys_to_check: - value = ti.xcom_pull(task_ids='get_token', key=key) - xcom_values[key] = value - # Avoid logging potentially sensitive command details fully in production - if key == 'ytdlp_command' and value: - log_value = f"{value[:50]}..." 
# Log truncated command - else: - log_value = value - logger.info(f"XCom key='{key}': {log_value}") - logger.info("----------------------------------------------") - return xcom_values # Return values for potential future use - - check_xcom_task = PythonOperator( - task_id='check_xcom_after_get_token', - python_callable=_check_xcom_callable, - ) - check_xcom_task.doc_md = "Logs the values pushed to XCom by the 'get_token' task for debugging purposes." - - display_info = PythonOperator( - task_id='display_token_info', - python_callable=display_token_info, - trigger_rule='all_success' - ) - display_info.doc_md = """ - ### Display Token Info Task - Pulls information from XCom, parses the `info.json` file (if available), - logs video details, and generates example `yt-dlp` commands. - - **Pulls from XCom (task_id='get_token'):** - - `info_json_path` - - `socks_proxy` - - `ytdlp_command` - - **Pushes to XCom:** - - `download_cmd`: Base command using `--load-info-json` (user needs to add format/output). - """ - - store_info = PythonOperator( - task_id='store_token_info', # Use consistent task ID naming - python_callable=store_token_info, - ) - store_info.doc_md = """ - ### Store Token Info Task - Pulls information from XCom and DAG parameters, reads the `info.json` content, - and stores relevant data in a Redis hash. - - **Pulls from XCom (task_id='get_token'):** - - `ytdlp_command` - - `socks_proxy` - - `info_json_path` - - **Pulls from DAG context:** - - `params['url']` (or `dag_run.conf['url']`) - - **Stores in Redis Hash (key: `token_info:`):** - - `url`: The video URL. - - `ytdlp_command`: Base command using `--load-info-json`. - - `proxy`: The SOCKS proxy string. - - `info_json_path`: Path to the saved info.json file. - - `info_json`: The full content of the info.json file (as a JSON string). - - `video_id`: Extracted video ID. - - `timestamp`: Unix timestamp of storage. - - `ytdlp_command_tokens`: The original command string from the server (contains tokens/cookies). - - Sets a 24-hour expiration on the Redis key. - """ - - # Define task dependencies matching the reference DAG structure - get_token >> check_xcom_task >> display_info >> store_info diff --git a/airflow/dags-backup/ytdlp_mgmt_queue_check_status.py b/airflow/dags-backup/ytdlp_mgmt_queue_check_status.py deleted file mode 100644 index 17c75b0..0000000 --- a/airflow/dags-backup/ytdlp_mgmt_queue_check_status.py +++ /dev/null @@ -1,179 +0,0 @@ -# -*- coding: utf-8 -*- -# vim:fenc=utf-8 -# -# Copyright © 2024 rl -# -# Distributed under terms of the MIT license. - -""" -Airflow DAG for manually checking the status (type and size) of a specific Redis key used by YTDLP queues. 
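[Editor's note] For orientation while reading this check DAG: the sequential processor DAG
later in this patch consumes URLs via LPOP from the "<queue_name>_inbox" list and records
outcomes in the "_progress", "_result" and "_fail" hashes that the tasks below inspect.
A minimal, illustrative way to seed that inbox with redis-py (the host, port and example URL
are placeholders, not values mandated by this repository) would be:

    import redis

    client = redis.Redis(host="localhost", port=6379)
    client.rpush("video_queue_inbox", "https://www.youtube.com/watch?v=sOlTX9uxUtM")
    print(client.llen("video_queue_inbox"))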
-""" - -from airflow import DAG -from airflow.exceptions import AirflowException -from airflow.models.param import Param -from airflow.operators.python import PythonOperator -from airflow.providers.redis.hooks.redis import RedisHook -from airflow.utils.dates import days_ago -from datetime import datetime, timedelta, timezone -import logging -import json -import redis # Import redis exceptions if needed - -# Configure logging -logger = logging.getLogger(__name__) - -# Default settings -DEFAULT_REDIS_CONN_ID = 'redis_default' -DEFAULT_QUEUE_BASE_NAME = 'video_queue' -DEFAULT_MAX_ITEMS_TO_LIST = 25 - -# Import utility functions -from utils.redis_utils import _get_redis_client - -# --- Python Callable for Check and List Task --- - -def check_and_list_queue_callable(**context): - """Checks the type and size of a Redis key and lists its recent contents.""" - params = context['params'] - redis_conn_id = params['redis_conn_id'] - # queue_suffix is passed from the PythonOperator's op_kwargs, which are available in the context - queue_suffix = context['queue_suffix'] - queue_name = params.get('queue_name', DEFAULT_QUEUE_BASE_NAME) - queue_to_check = f"{queue_name}{queue_suffix}" - max_items = int(params.get('max_items_to_list', DEFAULT_MAX_ITEMS_TO_LIST)) - - logger.info(f"--- Checking Status and Contents of Redis Key: '{queue_to_check}' ---") - logger.info(f"Using connection '{redis_conn_id}', listing up to {max_items} items.") - - try: - redis_client = _get_redis_client(redis_conn_id) - key_type_bytes = redis_client.type(queue_to_check) - key_type = key_type_bytes.decode('utf-8') - - if key_type == 'list': - list_length = redis_client.llen(queue_to_check) - logger.info(f"Redis key '{queue_to_check}' is a LIST with {list_length} items.") - if list_length > 0: - items_to_fetch = min(max_items, list_length) - # lrange with negative indices gets items from the end (most recent for rpush) - contents_bytes = redis_client.lrange(queue_to_check, -items_to_fetch, -1) - contents = [item.decode('utf-8') for item in contents_bytes] - contents.reverse() # Show most recent first - logger.info(f"--- Showing most recent {len(contents)} of {list_length} items ---") - for i, item in enumerate(contents): - logger.info(f" [recent_{i}]: {item}") - if list_length > len(contents): - logger.info(f" ... ({list_length - len(contents)} older items not shown)") - logger.info(f"--- End of List Contents ---") - - elif key_type == 'hash': - hash_size = redis_client.hlen(queue_to_check) - logger.info(f"Redis key '{queue_to_check}' is a HASH with {hash_size} fields.") - if hash_size > 0: - logger.info(f"--- Showing a sample of up to {max_items} fields ---") - item_count = 0 - # Using hscan_iter to safely iterate over hash fields, count is a hint - for field_bytes, value_bytes in redis_client.hscan_iter(queue_to_check, count=max_items): - if item_count >= max_items: - logger.info(f" ... 
(stopped listing after {max_items} items of {hash_size})") - break - field = field_bytes.decode('utf-8') - value = value_bytes.decode('utf-8') - # Try to pretty-print if value is JSON - try: - parsed_value = json.loads(value) - # Check for timestamp to show age - timestamp = parsed_value.get('end_time') or parsed_value.get('start_time') - age_str = "" - if timestamp: - age_seconds = (datetime.now(timezone.utc) - datetime.fromtimestamp(timestamp, timezone.utc)).total_seconds() - age_str = f" (age: {timedelta(seconds=age_seconds)})" - - pretty_value = json.dumps(parsed_value, indent=2) - logger.info(f" Field '{field}'{age_str}:\n{pretty_value}") - except (json.JSONDecodeError, TypeError): - logger.info(f" Field '{field}': {value}") - item_count += 1 - logger.info(f"--- End of Hash Contents ---") - - elif key_type == 'none': - logger.info(f"Redis key '{queue_to_check}' does not exist.") - else: - logger.info(f"Redis key '{queue_to_check}' is of type '{key_type}'. Listing contents for this type is not implemented.") - - except Exception as e: - logger.error(f"Failed to check/list contents of Redis key '{queue_to_check}': {e}", exc_info=True) - raise AirflowException(f"Failed to process Redis key: {e}") - -# --- DAG Definition --- -default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 0, # No retries for a manual check/list operation - 'start_date': days_ago(1) -} - -with DAG( - dag_id='ytdlp_mgmt_queues_check_status', - default_args=default_args, - schedule_interval=None, # Manually triggered - catchup=False, - description='Manually check the status and recent items of all YTDLP Redis queues for a given base name.', - tags=['ytdlp', 'queue', 'management', 'redis', 'manual', 'status', 'list'], - params={ - 'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."), - 'queue_name': Param( - DEFAULT_QUEUE_BASE_NAME, - type="string", - description="Base name for the Redis queues (e.g., 'video_queue')." - ), - 'max_items_to_list': Param(DEFAULT_MAX_ITEMS_TO_LIST, type="integer", description="Maximum number of recent items/fields to list from each queue."), - } -) as dag: - - check_inbox_queue = PythonOperator( - task_id='check_inbox_queue', - python_callable=check_and_list_queue_callable, - op_kwargs={'queue_suffix': '_inbox'}, - ) - check_inbox_queue.doc_md = """ - ### Check Inbox Queue (`_inbox`) - Checks the status and lists the most recent URLs waiting to be processed. - The full queue name is `{{ params.queue_name }}_inbox`. - """ - - check_progress_queue = PythonOperator( - task_id='check_progress_queue', - python_callable=check_and_list_queue_callable, - op_kwargs={'queue_suffix': '_progress'}, - ) - check_progress_queue.doc_md = """ - ### Check Progress Queue (`_progress`) - Checks the status and lists a sample of URLs currently being processed. - The full queue name is `{{ params.queue_name }}_progress`. - """ - - check_result_queue = PythonOperator( - task_id='check_result_queue', - python_callable=check_and_list_queue_callable, - op_kwargs={'queue_suffix': '_result'}, - ) - check_result_queue.doc_md = """ - ### Check Result Queue (`_result`) - Checks the status and lists a sample of successfully processed URLs. - The full queue name is `{{ params.queue_name }}_result`. 
- """ - - check_fail_queue = PythonOperator( - task_id='check_fail_queue', - python_callable=check_and_list_queue_callable, - op_kwargs={'queue_suffix': '_fail'}, - ) - check_fail_queue.doc_md = """ - ### Check Fail Queue (`_fail`) - Checks the status and lists a sample of failed URLs. - The full queue name is `{{ params.queue_name }}_fail`. - """ diff --git a/airflow/dags-backup/ytdlp_ops_worker_per_url.py b/airflow/dags-backup/ytdlp_ops_worker_per_url.py deleted file mode 100644 index 52797cf..0000000 --- a/airflow/dags-backup/ytdlp_ops_worker_per_url.py +++ /dev/null @@ -1,343 +0,0 @@ -# -*- coding: utf-8 -*- -# vim:fenc=utf-8 -# -# Copyright © 2024 rl -# -# Distributed under terms of the MIT license. - -""" -DAG for processing a single YouTube URL passed via DAG run configuration. -This is the "Worker" part of a Sensor/Worker pattern. -This DAG has been refactored to use the TaskFlow API to implement worker affinity, -ensuring all tasks for a single URL run on the same machine. -""" - -from __future__ import annotations - -from airflow.decorators import task, task_group -from airflow.exceptions import AirflowException, AirflowSkipException -from airflow.models import Variable -from airflow.models.dag import DAG -from airflow.models.param import Param -from airflow.models.xcom_arg import XComArg -from airflow.operators.dummy import DummyOperator -from airflow.operators.bash import BashOperator -from airflow.utils.dates import days_ago -from airflow.api.common.trigger_dag import trigger_dag -from datetime import timedelta, datetime -import json -import logging -import os -import random -import re -import socket -import time -import traceback -import uuid -import subprocess -import shlex - -# Import utility functions and Thrift modules -from utils.redis_utils import _get_redis_client - -# Handle potential import issues with Thrift modules -try: - from pangramia.yt.common.ttypes import TokenUpdateMode -except ImportError as e: - logging.warning(f"Could not import TokenUpdateMode from pangramia.yt.common.ttypes: {e}") - TokenUpdateMode = None - -try: - from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException -except ImportError as e: - logging.warning(f"Could not import PBServiceException/PBUserException from pangramia.yt.exceptions.ttypes: {e}") - PBServiceException = Exception - PBUserException = Exception - -try: - from pangramia.yt.tokens_ops import YTTokenOpService -except ImportError as e: - logging.warning(f"Could not import YTTokenOpService from pangramia.yt.tokens_ops: {e}") - YTTokenOpService = None - -try: - from thrift.protocol import TBinaryProtocol - from thrift.transport import TSocket, TTransport - from thrift.transport.TTransport import TTransportException -except ImportError as e: - logging.warning(f"Could not import thrift modules: {e}") - TBinaryProtocol = None - TSocket = None - TTransport = None - TTransportException = Exception - -# Configure logging -logger = logging.getLogger(__name__) - -# Default settings from Airflow Variables or hardcoded fallbacks -DEFAULT_QUEUE_NAME = 'video_queue' -DEFAULT_REDIS_CONN_ID = 'redis_default' -DEFAULT_TIMEOUT = 3600 -DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1") -DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080) - -# The queue is set to a fallback here. 
The actual worker-specific queue is -# assigned just-in-time by the task_instance_mutation_hook in airflow_local_settings.py, -# which reads the 'worker_queue' from the DAG run configuration. -DEFAULT_ARGS = { - 'owner': 'airflow', - 'retries': 0, - 'queue': 'queue-dl', # Fallback queue. Will be overridden by the policy hook. -} - - -# --- Helper Functions --- - -def _get_thrift_client(host, port, timeout): - """Helper to create and connect a Thrift client.""" - if not TSocket or not TTransport or not TBinaryProtocol: - raise AirflowException("Required Thrift modules are not available") - - transport = TSocket.TSocket(host, port) - transport.setTimeout(timeout * 1000) - transport = TTransport.TFramedTransport(transport) - protocol = TBinaryProtocol.TBinaryProtocolFactory() - client = YTTokenOpService.Client(protocol) if YTTokenOpService else None - if client: - transport.open() - logger.info(f"Connected to Thrift server at {host}:{port}") - return client, transport - -def _extract_video_id(url): - """Extracts YouTube video ID from URL.""" - if not url or not isinstance(url, str): - return None - patterns = [r'v=([a-zA-Z0-9_-]{11})', r'youtu\.be/([a-zA-Z0-9_-]{11})'] - for pattern in patterns: - match = re.search(pattern, url) - if match: - return match.group(1) - return None - -def _get_account_pool(params: dict) -> list: - """ - Gets the list of accounts to use for processing, filtering out banned/resting accounts. - Supports explicit list, prefix-based generation, and single account modes. - """ - account_pool_str = params.get('account_pool', 'default_account') - accounts = [] - is_prefix_mode = False - - if ',' in account_pool_str: - accounts = [acc.strip() for acc in account_pool_str.split(',') if acc.strip()] - else: - prefix = account_pool_str - pool_size_param = params.get('account_pool_size') - if pool_size_param is not None: - is_prefix_mode = True - pool_size = int(pool_size_param) - accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)] - else: - accounts = [prefix] - - if not accounts: - raise AirflowException("Initial account pool is empty.") - - redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID) - try: - redis_client = _get_redis_client(redis_conn_id) - active_accounts = [] - for account in accounts: - status_bytes = redis_client.hget(f"account_status:{account}", "status") - status = status_bytes.decode('utf-8') if status_bytes else "ACTIVE" - if status not in ['BANNED'] and 'RESTING' not in status: - active_accounts.append(account) - - if not active_accounts and accounts: - auto_create = params.get('auto_create_new_accounts_on_exhaustion', False) - if auto_create and is_prefix_mode: - new_account_id = f"{account_pool_str}-auto-{str(uuid.uuid4())[:8]}" - logger.warning(f"Account pool exhausted. Auto-creating new account: '{new_account_id}'") - active_accounts.append(new_account_id) - else: - raise AirflowException("All accounts in the configured pool are currently exhausted.") - accounts = active_accounts - except Exception as e: - logger.error(f"Could not filter accounts from Redis. Using unfiltered pool. 
Error: {e}", exc_info=True) - - if not accounts: - raise AirflowException("Account pool is empty after filtering.") - - logger.info(f"Final active account pool with {len(accounts)} accounts.") - return accounts - -# ============================================================================= -# TASK DEFINITIONS (TaskFlow API) -# ============================================================================= - -@task -def get_url_and_assign_account(**context): - """ - Gets the URL to process from the DAG run configuration and assigns an active account. - This is the first task in the pinned-worker DAG. - """ - params = context['params'] - - # Update yt-dlp to latest nightly before every run - subprocess.run(["/usr/local/bin/update-yt-dlp.sh"], check=True) - - # The URL is passed by the dispatcher DAG. - url_to_process = params.get('url_to_process') - if not url_to_process: - raise AirflowException("'url_to_process' was not found in the DAG run configuration.") - logger.info(f"Received URL '{url_to_process}' to process.") - - # Account assignment logic is the same as before. - account_id = random.choice(_get_account_pool(params)) - logger.info(f"Selected account '{account_id}' for this run.") - - return { - 'url_to_process': url_to_process, - 'account_id': account_id, - 'accounts_tried': [account_id], - } - -@task -def get_token(initial_data: dict, **context): - """Makes a single attempt to get a token from the Thrift service.""" - ti = context['task_instance'] - params = context['params'] - - account_id = initial_data['account_id'] - url = initial_data['url_to_process'] - info_json_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') - - host, port, timeout = params['service_ip'], int(params['service_port']), int(params.get('timeout', DEFAULT_TIMEOUT)) - machine_id = params.get('machine_id') or socket.gethostname() - - logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' ---") - client, transport = None, None - try: - client, transport = _get_thrift_client(host, port, timeout) - if not client or not TokenUpdateMode: - raise AirflowException("Thrift client or TokenUpdateMode not available") - - token_data = client.getOrRefreshToken(accountId=account_id, updateType=TokenUpdateMode.AUTO, url=url, clients=params.get('clients'), machineId=machine_id) - - info_json = getattr(token_data, 'infoJson', None) - if not (info_json and json.loads(info_json)): - raise AirflowException("Service returned success but info.json was empty or invalid.") - - video_id = _extract_video_id(url) - os.makedirs(info_json_dir, exist_ok=True) - # Use a readable timestamp for a unique filename on each attempt. 
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - info_json_path = os.path.join(info_json_dir, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json") - with open(info_json_path, 'w', encoding='utf-8') as f: - f.write(info_json) - - proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None) - return { - 'info_json_path': info_json_path, - 'socks_proxy': getattr(token_data, proxy_attr) if proxy_attr else None, - 'ytdlp_command': getattr(token_data, proxy_attr) if proxy_attr else None, - 'successful_account_id': account_id, - 'original_url': url, # Include original URL for fallback - } - except (PBServiceException, PBUserException, TTransportException) as e: - error_context = getattr(e, 'context', None) - if isinstance(error_context, str): - try: error_context = json.loads(error_context.replace("'", "\"")) - except: pass - - error_details = { - 'error_message': getattr(e, 'message', str(e)), - 'error_code': getattr(e, 'errorCode', 'TRANSPORT_ERROR'), - 'proxy_url': error_context.get('proxy_url') if isinstance(error_context, dict) else None - } - logger.error(f"Thrift call failed for account '{account_id}'. Exception: {error_details['error_message']}") - ti.xcom_push(key='error_details', value=error_details) - - # If it's not a connection error, run diagnostic yt-dlp command - if error_details['error_code'] not in ["SOCKS5_CONNECTION_FAILED", "SOCKET_TIMEOUT", "TRANSPORT_ERROR", "CAMOUFOX_TIMEOUT"]: - _run_diagnostic_yt_dlp(url, error_details.get('proxy_url'), params.get('clients', 'web')) - - raise AirflowException(f"Thrift call failed: {error_details['error_message']}") - finally: - if transport and transport.isOpen(): - transport.close() - -def _run_diagnostic_yt_dlp(url, proxy, clients): - """Runs yt-dlp with diagnostic flags to capture failed responses.""" - logger.warning("Running diagnostic yt-dlp command to capture failed response...") - - dump_dir = "/opt/airflow/dumps" - os.makedirs(dump_dir, exist_ok=True) - - video_id = _extract_video_id(url) - dump_file = os.path.join(dump_dir, f"diagnostic_{video_id}_{int(time.time())}.dump") - - cmd = [ - 'yt-dlp', - '--extractor-args', f'youtube:player-client={clients}', - '--write-pages', - '--proxy', proxy or '', - '-FvU', - url, - '--write-info-json', - '--print', 'filename', - '--continue', - '--no-progress', - '--no-simulate', - '--ignore-errors', - '--no-playlist' - ] - - logger.info(f"Executing diagnostic command: {' '.join(shlex.quote(arg) for arg in cmd)}") - logger.info(f"Diagnostic dump will be saved to: {dump_file}") - - try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) - logger.info(f"Diagnostic yt-dlp exit code: {result.returncode}") - if result.stdout: - logger.info(f"Diagnostic output:\n{result.stdout}") - if result.stderr: - logger.error(f"Diagnostic stderr:\n{result.stderr}") - except subprocess.TimeoutExpired: - logger.error("Diagnostic yt-dlp command timed out after 5 minutes") - except Exception as e: - logger.error(f"Failed to run diagnostic yt-dlp: {e}") - -@task.branch -def handle_bannable_error_branch(task_id_to_check: str, **context): - """Inspects a failed task and routes to retry logic if the error is bannable.""" - ti = context['task_instance'] - params = context['params'] - error_details = ti.xcom_pull(task_ids=task_id_to_check, key='error_details') - if not error_details: - return None # Let DAG fail for unexpected errors - - error_code = error_details.get('error_code', '').strip() - policy = 
params.get('on_bannable_failure', 'retry_with_new_account') - - # Connection errors should be retried without banning the account. - connection_errors = ['SOCKS5_CONNECTION_FAILED', 'SOCKET_TIMEOUT', 'TRANSPORT_ERROR', 'CAMOUFOX_TIMEOUT'] - if error_code in connection_errors: - logger.info(f"Handling connection error '{error_code}' from '{task_id_to_check}'. Policy: '{policy}'") - if policy == 'stop_loop': - logger.warning(f"Connection error with 'stop_loop' policy. Failing DAG without banning.") - return None - else: - logger.info("Retrying with a new account without banning.") - return 'assign_new_account_for_retry' - - is_bannable = error_code in ["BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED"] - - logger.info(f"Handling failure from '{task_id_to_check}'. Error code: '{error_code}', Policy: '{policy}'") - if is_bannable and policy in ['retry_with_new_account', 'retry_and_ban_account_only']: - return 'ban_account_and_prepare_for_retry' - if is_bannable and policy in ['retry_on_connection_error', 'retry_without_ban']: - return 'assign_new_account_for_retry' - if is_bannable: # stop_loop - return 'ban_and_fail' - return None # Not a bannable error, let DAG fail diff --git a/airflow/dags-backup/ytdlp_proc_sequential_processor.py b/airflow/dags-backup/ytdlp_proc_sequential_processor.py deleted file mode 100644 index 08c7c6d..0000000 --- a/airflow/dags-backup/ytdlp_proc_sequential_processor.py +++ /dev/null @@ -1,707 +0,0 @@ -# -*- coding: utf-8 -*- -# vim:fenc=utf-8 -# -# Copyright © 2024 rl -# -# Distributed under terms of the MIT license. - -""" -DAG for processing YouTube URLs sequentially from a Redis queue using YTDLP Ops Thrift service. -""" - -from airflow import DAG -from airflow.exceptions import AirflowException, AirflowSkipException, AirflowFailException -from airflow.hooks.base import BaseHook -from airflow.models import BaseOperator, Variable -from airflow.models.param import Param -from airflow.operators.bash import BashOperator # Import BashOperator -from airflow.operators.python import PythonOperator -from airflow.operators.trigger_dagrun import TriggerDagRunOperator -from airflow.providers.redis.hooks.redis import RedisHook -from airflow.utils.dates import days_ago -from airflow.utils.decorators import apply_defaults -from datetime import datetime, timedelta -from pangramia.yt.common.ttypes import TokenUpdateMode -from pangramia.yt.exceptions.ttypes import PBServiceException -from pangramia.yt.tokens_ops import YTTokenOpService -from thrift.protocol import TBinaryProtocol -from thrift.transport import TSocket, TTransport -from thrift.transport.TTransport import TTransportException -import json -import logging -import os -import redis # Import redis exceptions if needed -import socket -import time -import traceback # For logging stack traces in failure handler - -# Configure logging -logger = logging.getLogger(__name__) - -# Default settings -DEFAULT_QUEUE_NAME = 'video_queue' # Base name for queues -DEFAULT_REDIS_CONN_ID = 'redis_default' -DEFAULT_TIMEOUT = 30 # Default Thrift timeout in seconds -MAX_RETRIES_REDIS_LOOKUP = 3 # Retries for fetching service details from Redis -RETRY_DELAY_REDIS_LOOKUP = 10 # Delay (seconds) for Redis lookup retries - -# --- Helper Functions --- - -from utils.redis_utils import _get_redis_client - -def _extract_video_id(url): - """Extracts YouTube video ID from URL.""" - if not url or not isinstance(url, str): - logger.debug("URL is empty or not a string, cannot extract video ID.") - return None - try: - video_id = None - if 
'youtube.com/watch?v=' in url: - video_id = url.split('v=')[1].split('&')[0] - elif 'youtu.be/' in url: - video_id = url.split('youtu.be/')[1].split('?')[0] - - if video_id and len(video_id) >= 11: - video_id = video_id[:11] # Standard ID length - logger.debug(f"Extracted video ID '{video_id}' from URL: {url}") - return video_id - else: - logger.debug(f"Could not extract a standard video ID pattern from URL: {url}") - return None - except Exception as e: - logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}") - return None - -# --- Queue Management Callables --- - -def pop_url_from_queue(**context): - """Pops a URL from the inbox queue and pushes to XCom.""" - params = context['params'] - queue_name = params['queue_name'] - inbox_queue = f"{queue_name}_inbox" - redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID) - logger.info(f"Attempting to pop URL from inbox queue: {inbox_queue}") - - try: - client = _get_redis_client(redis_conn_id) - # LPOP is non-blocking, returns None if empty - url_bytes = client.lpop(inbox_queue) # Returns bytes if decode_responses=False on hook/client - - if url_bytes: - url = url_bytes.decode('utf-8') if isinstance(url_bytes, bytes) else url_bytes - logger.info(f"Popped URL: {url}") - context['task_instance'].xcom_push(key='current_url', value=url) - return url # Return URL for logging/potential use - else: - logger.info(f"Inbox queue '{inbox_queue}' is empty. Skipping downstream tasks.") - context['task_instance'].xcom_push(key='current_url', value=None) - # Raise AirflowSkipException to signal downstream tasks to skip - raise AirflowSkipException(f"Inbox queue '{inbox_queue}' is empty.") - except AirflowSkipException: - raise # Re-raise skip exception - except Exception as e: - logger.error(f"Error popping URL from Redis queue '{inbox_queue}': {e}", exc_info=True) - raise AirflowException(f"Failed to pop URL from Redis: {e}") - - -def move_url_to_progress(**context): - """Moves the current URL from XCom to the progress hash.""" - ti = context['task_instance'] - url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url') - - # This task should be skipped if pop_url_from_queue raised AirflowSkipException - # Adding check for robustness - if not url: - logger.info("No URL found in XCom (or upstream skipped). Skipping move to progress.") - raise AirflowSkipException("No URL to process.") - - params = context['params'] - queue_name = params['queue_name'] - progress_queue = f"{queue_name}_progress" - redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID) - logger.info(f"Moving URL '{url}' to progress hash: {progress_queue}") - - progress_data = { - 'status': 'processing', - 'start_time': time.time(), - 'dag_run_id': context['dag_run'].run_id, - 'task_instance_key_str': context['task_instance_key_str'] - } - - try: - client = _get_redis_client(redis_conn_id) - client.hset(progress_queue, url, json.dumps(progress_data)) - logger.info(f"Moved URL '{url}' to progress hash '{progress_queue}'.") - except Exception as e: - logger.error(f"Error moving URL to Redis progress hash '{progress_queue}': {e}", exc_info=True) - # If this fails, the URL is popped but not tracked as processing. Fail the task. 
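# [Editor's note] A hedged, illustrative alternative to failing outright here: compensate for
# the already-popped URL by pushing it back onto the inbox list, mirroring the requeue path
# used in handle_failure below. This sketch is not part of the original DAG and assumes
# `client` was created before the failure occurred:
#
#   try:
#       client.rpush(f"{queue_name}_inbox", url)
#       logger.info(f"Re-queued '{url}' to inbox after progress-tracking failure.")
#   except Exception as requeue_error:
#       logger.error(f"Could not re-queue '{url}': {requeue_error}")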
- raise AirflowException(f"Failed to move URL to progress hash: {e}") - - -def handle_success(**context): - """Moves URL from progress to result hash on success.""" - ti = context['task_instance'] - url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url') - if not url: - logger.warning("handle_success called but no URL found from pop_url_from_queue XCom. This shouldn't happen on success path.") - return # Or raise error - - params = context['params'] - queue_name = params['queue_name'] - progress_queue = f"{queue_name}_progress" - result_queue = f"{queue_name}_result" - redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID) - - # Pull results from get_token task - info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path') - socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy') - ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command') # Original command - downloaded_file_path = ti.xcom_pull(task_ids='download_video') # Pull from download_video task - - logger.info(f"Handling success for URL: {url}") - logger.info(f" Info JSON Path: {info_json_path}") - logger.info(f" SOCKS Proxy: {socks_proxy}") - logger.info(f" YTDLP Command: {ytdlp_command[:100] if ytdlp_command else 'None'}...") # Log truncated command - logger.info(f" Downloaded File Path: {downloaded_file_path}") - - result_data = { - 'status': 'success', - 'end_time': time.time(), - 'info_json_path': info_json_path, - 'socks_proxy': socks_proxy, - 'ytdlp_command': ytdlp_command, - 'downloaded_file_path': downloaded_file_path, - 'url': url, - 'dag_run_id': context['dag_run'].run_id, - 'task_instance_key_str': context['task_instance_key_str'] # Record which task instance succeeded - } - - try: - client = _get_redis_client(redis_conn_id) - # Remove from progress hash - removed_count = client.hdel(progress_queue, url) - if removed_count > 0: - logger.info(f"Removed URL '{url}' from progress hash '{progress_queue}'.") - else: - logger.warning(f"URL '{url}' not found in progress hash '{progress_queue}' during success handling.") - - # Add to result hash - client.hset(result_queue, url, json.dumps(result_data)) - logger.info(f"Stored success result for URL '{url}' in result hash '{result_queue}'.") - - except Exception as e: - logger.error(f"Error handling success in Redis for URL '{url}': {e}", exc_info=True) - # Even if Redis fails, the task succeeded. Log error but don't fail the task. - # Consider adding retry logic for Redis operations here or marking state differently. - - -def handle_failure(**context): - """ - Handles failed processing. Depending on the `requeue_on_failure` parameter, - it either moves the URL to the fail hash or re-queues it in the inbox. - If `stop_on_failure` is True, this task will fail, stopping the DAG loop. 
- """ - ti = context['task_instance'] - url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url') - if not url: - logger.error("handle_failure called but no URL found from pop_url_from_queue XCom.") - return - - params = context['params'] - queue_name = params['queue_name'] - progress_queue = f"{queue_name}_progress" - fail_queue = f"{queue_name}_fail" - inbox_queue = f"{queue_name}_inbox" - redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID) - requeue_on_failure = params.get('requeue_on_failure', False) - stop_on_failure = params.get('stop_on_failure', True) # Default to True - - exception = context.get('exception') - error_message = str(exception) if exception else "Unknown error" - tb_str = traceback.format_exc() if exception else "No traceback available." - - logger.info(f"Handling failure for URL: {url}") - logger.error(f" Failure Reason: {error_message}") - logger.debug(f" Traceback:\n{tb_str}") - - try: - client = _get_redis_client(redis_conn_id) - # Always remove from progress hash first - removed_count = client.hdel(progress_queue, url) - if removed_count > 0: - logger.info(f"Removed URL '{url}' from progress hash '{progress_queue}'.") - else: - logger.warning(f"URL '{url}' not found in progress hash '{progress_queue}' during failure handling.") - - if requeue_on_failure: - # Re-queue the URL for another attempt - client.rpush(inbox_queue, url) - logger.info(f"Re-queued failed URL '{url}' to inbox '{inbox_queue}' for retry.") - else: - # Move to the permanent fail hash - fail_data = { - 'status': 'failed', - 'end_time': time.time(), - 'error': error_message, - 'traceback': tb_str, - 'url': url, - 'dag_run_id': context['dag_run'].run_id, - 'task_instance_key_str': context['task_instance_key_str'] - } - client.hset(fail_queue, url, json.dumps(fail_data)) - logger.info(f"Stored failure details for URL '{url}' in fail hash '{fail_queue}'.") - - except Exception as e: - logger.error(f"Error during failure handling in Redis for URL '{url}': {e}", exc_info=True) - # This is a critical error in the failure handling logic itself. - raise AirflowException(f"Could not handle failure in Redis: {e}") - - # After handling Redis, decide whether to fail the task to stop the loop - if stop_on_failure: - logger.error("stop_on_failure is True. Failing this task to stop the DAG loop.") - # Re-raise the original exception to fail the task instance. - # This is better than AirflowFailException because it preserves the original error. - if exception: - raise exception - else: - # If for some reason there's no exception, fail explicitly. - raise AirflowFailException("Failing task as per stop_on_failure=True, but original exception was not found.") - - -# --- YtdlpOpsOperator --- - -class YtdlpOpsOperator(BaseOperator): - """ - Custom Airflow operator to interact with YTDLP Thrift service. Handles direct connections - and Redis-based discovery, retrieves tokens, saves info.json, and manages errors. - Modified to pull URL from XCom for sequential processing. 
- """ - # Removed 'url' from template_fields as it's pulled from XCom - template_fields = ('service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir', 'redis_conn_id') - - @apply_defaults - def __init__(self, - # url parameter removed - will be pulled from XCom - redis_conn_id=DEFAULT_REDIS_CONN_ID, - max_retries_lookup=MAX_RETRIES_REDIS_LOOKUP, - retry_delay_lookup=RETRY_DELAY_REDIS_LOOKUP, - service_ip=None, - service_port=None, - redis_enabled=False, # Default to direct connection now - account_id=None, - # save_info_json removed, always True - info_json_dir=None, - # get_socks_proxy removed, always True - # store_socks_proxy removed, always True - # get_socks_proxy=True, # Removed - # store_socks_proxy=True, # Store proxy in XCom by default # Removed - timeout=DEFAULT_TIMEOUT, - *args, **kwargs): - super().__init__(*args, **kwargs) - - logger.info(f"Initializing YtdlpOpsOperator (Processor Version) with parameters: " - f"redis_conn_id={redis_conn_id}, max_retries_lookup={max_retries_lookup}, retry_delay_lookup={retry_delay_lookup}, " - f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, " - f"account_id={account_id}, info_json_dir={info_json_dir}, timeout={timeout}") - # save_info_json, get_socks_proxy, store_socks_proxy removed from log - - # Validate parameters based on connection mode - if redis_enabled: - # If using Redis, account_id is essential for lookup - if not account_id: - raise ValueError("account_id is required when redis_enabled=True for service lookup.") - else: - # If direct connection, IP and Port are essential - if not service_ip or not service_port: - raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False.") - # Account ID is still needed for the API call itself, but rely on DAG param or operator config - if not account_id: - logger.warning("No account_id provided for direct connection mode. Ensure it's set in DAG params or operator config.") - # We won't assign 'default' here, let the value passed during instantiation be used. - - # self.url is no longer needed here - self.redis_conn_id = redis_conn_id - self.max_retries_lookup = max_retries_lookup - self.retry_delay_lookup = int(retry_delay_lookup.total_seconds() if isinstance(retry_delay_lookup, timedelta) else retry_delay_lookup) - self.service_ip = service_ip - self.service_port = service_port - self.redis_enabled = redis_enabled - self.account_id = account_id - # self.save_info_json removed - self.info_json_dir = info_json_dir # Still needed - # self.get_socks_proxy removed - # self.store_socks_proxy removed - self.timeout = timeout - - def execute(self, context): - logger.info("Executing YtdlpOpsOperator (Processor Version)") - transport = None - ti = context['task_instance'] # Get task instance for XCom access - - try: - # --- Get URL from XCom --- - url = ti.xcom_pull(task_ids='pop_url_from_queue', key='current_url') - if not url: - # This should ideally be caught by upstream skip, but handle defensively - logger.info("No URL found in XCom from pop_url_from_queue. 
Skipping execution.") - raise AirflowSkipException("Upstream task did not provide a URL.") - logger.info(f"Processing URL from XCom: {url}") - # --- End Get URL --- - - logger.info("Getting task parameters and rendering templates") - params = context['params'] # DAG run params - - # Render template fields using context - # Use render_template_as_native for better type handling if needed, else render_template - redis_conn_id = self.render_template(self.redis_conn_id, context) - service_ip = self.render_template(self.service_ip, context) - service_port_rendered = self.render_template(self.service_port, context) - account_id = self.render_template(self.account_id, context) - timeout_rendered = self.render_template(self.timeout, context) - info_json_dir = self.render_template(self.info_json_dir, context) # Rendered here for _save_info_json - - # Determine effective settings (DAG params override operator defaults) - redis_enabled = params.get('redis_enabled', self.redis_enabled) - account_id = params.get('account_id', account_id) # Use DAG param if provided - redis_conn_id = params.get('redis_conn_id', redis_conn_id) # Use DAG param if provided - - logger.info(f"Effective settings: redis_enabled={redis_enabled}, account_id='{account_id}', redis_conn_id='{redis_conn_id}'") - - host = None - port = None - - if redis_enabled: - # Get Redis connection using the helper for consistency - redis_client = _get_redis_client(redis_conn_id) - logger.info(f"Successfully connected to Redis using connection '{redis_conn_id}' for service discovery.") - - # Get service details from Redis with retries - service_key = f"ytdlp:{account_id}" - legacy_key = account_id # For backward compatibility - - for attempt in range(self.max_retries_lookup): - try: - logger.info(f"Attempt {attempt + 1}/{self.max_retries_lookup}: Fetching service details from Redis for keys: '{service_key}', '{legacy_key}'") - service_details = redis_client.hgetall(service_key) - if not service_details: - logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'") - service_details = redis_client.hgetall(legacy_key) - - if not service_details: - raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}") - - # Find IP and port (case-insensitive keys) - ip_key = next((k for k in service_details if k.lower() == 'ip'), None) - port_key = next((k for k in service_details if k.lower() == 'port'), None) - - if not ip_key: raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}") - if not port_key: raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}") - - host = service_details[ip_key] # Assumes decode_responses=True in hook - port_str = service_details[port_key] - - try: - port = int(port_str) - except (ValueError, TypeError): - raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}") - - logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}") - break # Success - - except Exception as e: - logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {str(e)}") - if attempt == self.max_retries_lookup - 1: - logger.error("Max retries reached for fetching Redis details.") - raise AirflowException(f"Failed to get service details from Redis after {self.max_retries_lookup} attempts: {e}") - logger.info(f"Retrying in {self.retry_delay_lookup} seconds...") - time.sleep(self.retry_delay_lookup) - else: - # Direct connection: Use rendered/param values - host = 
params.get('service_ip', service_ip) # Use DAG param if provided - port_str = params.get('service_port', service_port_rendered) # Use DAG param if provided - - logger.info(f"Using direct connection settings: service_ip={host}, service_port={port_str}") - - if not host or not port_str: - raise ValueError("Direct connection requires service_ip and service_port (check Operator config and DAG params)") - try: - port = int(port_str) - except (ValueError, TypeError): - raise ValueError(f"Invalid service_port value: {port_str}") - - logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)") - - # Validate and use timeout - try: - timeout = int(timeout_rendered) - if timeout <= 0: raise ValueError("Timeout must be positive") - logger.info(f"Using timeout: {timeout} seconds") - except (ValueError, TypeError): - logger.warning(f"Invalid timeout value: '{timeout_rendered}'. Using default: {DEFAULT_TIMEOUT}") - timeout = DEFAULT_TIMEOUT - - # Create Thrift connection objects - # socket_conn = TSocket.TSocket(host, port) # Original - socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET) # Explicitly use AF_INET (IPv4) - socket_conn.setTimeout(timeout * 1000) # Thrift timeout is in milliseconds - transport = TTransport.TFramedTransport(socket_conn) # Use TFramedTransport if server expects it - # transport = TTransport.TBufferedTransport(socket_conn) # Use TBufferedTransport if server expects it - protocol = TBinaryProtocol.TBinaryProtocol(transport) - client = YTTokenOpService.Client(protocol) - - logger.info(f"Attempting to connect to Thrift server at {host}:{port}...") - try: - transport.open() - logger.info("Successfully connected to Thrift server.") - - # Test connection with ping - try: - client.ping() - logger.info("Server ping successful.") - except Exception as e: - logger.error(f"Server ping failed: {e}") - raise AirflowException(f"Server connection test (ping) failed: {e}") - - # Get token from service using the URL from XCom - try: - logger.info(f"Requesting token for accountId='{account_id}', url='{url}'") - token_data = client.getOrRefreshToken( - accountId=account_id, - updateType=TokenUpdateMode.AUTO, - url=url # Use the url variable from XCom - ) - logger.info("Successfully retrieved token data from service.") - except PBServiceException as e: - # Handle specific service exceptions - error_code = getattr(e, 'errorCode', 'N/A') - error_message = getattr(e, 'message', 'N/A') - error_context = getattr(e, 'context', {}) - logger.error(f"PBServiceException occurred: Code={error_code}, Message={error_message}") - if error_context: - logger.error(f" Context: {error_context}") # Log context separately - # Construct a concise error message for AirflowException - error_msg = f"YTDLP service error (Code: {error_code}): {error_message}" - # Add specific error code handling if needed... 
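# --- Editor's note (illustrative sketch, not part of the original patch) ---
# The operator above builds a framed binary Thrift client and calls
# getOrRefreshToken(). Condensed into a standalone smoke test it looks roughly
# like the snippet below; the generated-stub import paths, host/port and
# account id are assumptions, not confirmed by this repository.
import socket

from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from pangramia.yt import YTTokenOpService        # assumed stub module path
from pangramia.yt.ttypes import TokenUpdateMode  # assumed stub module path

sock = TSocket.TSocket("127.0.0.1", 9090, socket_family=socket.AF_INET)
sock.setTimeout(60 * 1000)                       # Thrift timeouts are in milliseconds
transport = TTransport.TFramedTransport(sock)    # server expects framed transport
client = YTTokenOpService.Client(TBinaryProtocol.TBinaryProtocol(transport))
transport.open()
try:
    client.ping()                                # connection test, as in the operator
    token = client.getOrRefreshToken(
        accountId="test_account",                # hypothetical account id
        updateType=TokenUpdateMode.AUTO,
        url="https://www.youtube.com/watch?v=example",
    )
    print(getattr(token, "socks5Proxy", None), bool(getattr(token, "infoJson", None)))
finally:
    transport.close()
# ---------------------------------------------------------------------------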
- logger.error(f"Failing task instance due to PBServiceException: {error_msg}") # Add explicit log before raising - raise AirflowException(error_msg) # Fail task on service error - except TTransportException as e: - logger.error(f"Thrift transport error during getOrRefreshToken: {e}") - logger.error(f"Failing task instance due to TTransportException: {e}") # Add explicit log before raising - raise AirflowException(f"Transport error during API call: {e}") - except Exception as e: - logger.error(f"Unexpected error during getOrRefreshToken: {e}") - logger.error(f"Failing task instance due to unexpected error during API call: {e}") # Add explicit log before raising - raise AirflowException(f"Unexpected error during API call: {e}") - - except TTransportException as e: - # Handle connection errors - logger.error(f"Thrift transport error during connection: {str(e)}") - logger.error(f"Failing task instance due to TTransportException during connection: {e}") # Add explicit log before raising - raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}") - # Removed the overly broad except Exception block here, as inner blocks raise AirflowException - - # --- Process Token Data --- - logger.debug(f"Token data received. Attributes: {dir(token_data)}") - - info_json_path = None # Initialize - - # save_info_json is now always True - logger.info("Proceeding to save info.json (save_info_json=True).") - info_json = self._get_info_json(token_data) - if info_json and self._is_valid_json(info_json): - try: - # Pass rendered info_json_dir to helper - info_json_path = self._save_info_json(context, info_json, url, account_id, info_json_dir) - if info_json_path: - ti.xcom_push(key='info_json_path', value=info_json_path) - logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}") - else: - ti.xcom_push(key='info_json_path', value=None) - logger.warning("info.json saving failed (check logs from _save_info_json).") - except Exception as e: - logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True) - ti.xcom_push(key='info_json_path', value=None) - elif info_json: - logger.warning("Retrieved infoJson is not valid JSON. Skipping save.") - ti.xcom_push(key='info_json_path', value=None) - else: - logger.info("No infoJson found in token data. 
Skipping save.") - ti.xcom_push(key='info_json_path', value=None) - - - # Extract and potentially store SOCKS proxy - # get_socks_proxy and store_socks_proxy are now always True - socks_proxy = None - logger.info("Attempting to extract SOCKS proxy (get_socks_proxy=True).") - proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None) - if proxy_attr: - socks_proxy = getattr(token_data, proxy_attr) - if socks_proxy: - logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}") - # Always store if found (store_socks_proxy=True) - ti.xcom_push(key='socks_proxy', value=socks_proxy) - logger.info("Pushed 'socks_proxy' to XCom.") - else: - logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty.") - # Store None if attribute found but empty - ti.xcom_push(key='socks_proxy', value=None) - logger.info("Pushed None to XCom for 'socks_proxy' as extracted value was empty.") - else: - logger.info("No SOCKS proxy attribute found in token data.") - # Store None if attribute not found - ti.xcom_push(key='socks_proxy', value=None) - logger.info("Pushed None to XCom for 'socks_proxy' as attribute was not found.") - - -# --- Removed old logic block --- -# # Extract and potentially store SOCKS proxy -# socks_proxy = None -# get_socks_proxy = params.get('get_socks_proxy', self.get_socks_proxy) -# store_socks_proxy = params.get('store_socks_proxy', self.store_socks_proxy) -# -# if get_socks_proxy: -# proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None) -# if proxy_attr: -# socks_proxy = getattr(token_data, proxy_attr) -# if socks_proxy: -# logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}") -# if store_socks_proxy: -# ti.xcom_push(key='socks_proxy', value=socks_proxy) -# logger.info("Pushed 'socks_proxy' to XCom.") -# else: -# logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty.") -# if store_socks_proxy: ti.xcom_push(key='socks_proxy', value=None) -# else: -# logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found.") -# if store_socks_proxy: ti.xcom_push(key='socks_proxy', value=None) -# else: -# logger.info("get_socks_proxy is False. Skipping proxy extraction.") -# if store_socks_proxy: ti.xcom_push(key='socks_proxy', value=None) -# --- End Removed old logic block --- - - - # Get the original command from the server, or construct a fallback - ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None) - if ytdlp_cmd: - logger.info(f"Original command received from server: {ytdlp_cmd[:100]}...") # Log truncated - else: - logger.warning("No 'ytdlpCommand' attribute found in token data. 
Constructing a fallback for logging.") - # Construct a representative command for logging purposes - if socks_proxy: - ytdlp_cmd = f"yt-dlp --dump-json --proxy \"{socks_proxy}\" \"{url}\"" - else: - ytdlp_cmd = f"yt-dlp --dump-json \"{url}\"" - logger.info(f"Constructed fallback command: {ytdlp_cmd}") - - # Push the command to XCom - ti.xcom_push(key='ytdlp_command', value=ytdlp_cmd) - logger.info("Pushed command to XCom key 'ytdlp_command'.") - - # No explicit return needed, success is implicit if no exception raised - - except (AirflowSkipException, AirflowFailException) as e: - logger.info(f"Task skipped or failed explicitly: {e}") - raise # Re-raise to let Airflow handle state - except AirflowException as e: # Catch AirflowExceptions raised explicitly - logger.error(f"Operation failed due to AirflowException: {e}", exc_info=True) - raise # Re-raise AirflowExceptions to ensure task failure - except (TTransportException, PBServiceException) as e: # Catch specific Thrift/Service errors not already handled inside inner try - logger.error(f"Unhandled YTDLP Service/Transport error in outer block: {e}", exc_info=True) - logger.error(f"Failing task instance due to unhandled outer Service/Transport error: {e}") # Add explicit log before raising - raise AirflowException(f"Unhandled YTDLP service error: {e}") # Wrap in AirflowException to fail task - except Exception as e: # General catch-all for truly unexpected errors - logger.error(f"Caught unexpected error in YtdlpOpsOperator outer block: {e}", exc_info=True) - logger.error(f"Failing task instance due to unexpected outer error: {e}") # Add explicit log before raising - raise AirflowException(f"Unexpected error caused task failure: {e}") # Wrap to fail task - finally: - if transport and transport.isOpen(): - logger.info("Closing Thrift transport.") - transport.close() - - # --- Helper Methods --- - - def _get_info_json(self, token_data): - """Safely extracts infoJson from token data.""" - return getattr(token_data, 'infoJson', None) - - def _is_valid_json(self, json_str): - """Checks if a string is valid JSON.""" - if not json_str or not isinstance(json_str, str): return False - try: - json.loads(json_str) - return True - except json.JSONDecodeError: - return False - - def _save_info_json(self, context, info_json, url, account_id, rendered_info_json_dir): - """Saves info_json to a file. Uses pre-rendered directory path.""" - try: - video_id = _extract_video_id(url) # Use standalone helper - - save_dir = rendered_info_json_dir or "." # Use rendered path - logger.info(f"Target directory for info.json: {save_dir}") - - # Ensure directory exists - try: - os.makedirs(save_dir, exist_ok=True) - logger.info(f"Ensured directory exists: {save_dir}") - except OSError as e: - logger.error(f"Could not create directory {save_dir}: {e}. 
Cannot save info.json.") - return None - - # Construct filename - timestamp = int(time.time()) - base_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json" - info_json_path = os.path.join(save_dir, base_filename) - latest_json_path = os.path.join(save_dir, "latest.json") # Path for the latest symlink/copy - - # Write to timestamped file - try: - logger.info(f"Writing info.json content (received from service) to {info_json_path}...") - with open(info_json_path, 'w', encoding='utf-8') as f: - f.write(info_json) - logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}") - except IOError as e: - logger.error(f"Failed to write info.json to {info_json_path}: {e}") - return None - - # Write to latest.json (overwrite) - best effort - try: - with open(latest_json_path, 'w', encoding='utf-8') as f: - f.write(info_json) - logger.info(f"Updated latest.json file: {latest_json_path}") - except IOError as e: - logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}") - - return info_json_path - - except Exception as e: - logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True) - return None - - -# ============================================================================= -# DAG Definition -# ============================================================================= - -default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 1, # Default retries for tasks like queue management - 'retry_delay': timedelta(minutes=1), - 'start_date': days_ago(1), - # Add concurrency control if needed for sequential processing - # 'concurrency': 1, # Ensure only one task instance runs at a time per DAG run - # 'max_active_runs': 1, # Ensure only one DAG run is active -} - -# Define DAG -# -# --- DAG Block Deactivated on 2025-07-16 --- -# This DAG has been replaced by the Sensor/Worker pattern implemented in: -# - ytdlp_sensor_redis_queue.py (polls the queue) -# - ytdlp_worker_per_url.py (processes a single URL) -# This code is kept for reference but is not active. -# diff --git a/airflow/dags-backup/ytdlp_service_dag.py b/airflow/dags-backup/ytdlp_service_dag.py deleted file mode 100644 index 224811a..0000000 --- a/airflow/dags-backup/ytdlp_service_dag.py +++ /dev/null @@ -1,974 +0,0 @@ -""" -DAG to deploy and manage YTDLP token service. - -This DAG handles the deployment, monitoring, and cleanup of a YTDLP token service -for a given account. It supports both Redis-based service discovery and direct -connection via manually specified host and port. - -Configuration Options: -- account_id: (Required) The account ID for which the service is being deployed. -- proxy: (Optional) The proxy to use for the service. -- redis_enabled: (Optional, default=True) Whether to use Redis for service discovery. - If False, you must provide `host` and `port` manually. -- host: (Optional) The host IP of the service. Required if `redis_enabled=False`. -- port: (Optional) The port of the service. Required if `redis_enabled=False`. - -Usage: -1. Redis-based service discovery: - - Set `redis_enabled=True` (default). - - Ensure Redis is configured in Airflow connections. - - The DAG will automatically discover the service IP and port from Redis. - -2. Manual host and port: - - Set `redis_enabled=False`. - - Provide `host` and `port` manually in the DAG configuration. - - Example: {"host": "192.168.1.100", "port": 9090}. 
- -Example Trigger Configuration: -{ - "account_id": "test_account", - "proxy": "socks5://proxy.example.com:1080", - "redis_enabled": False, - "host": "192.168.1.100", - "port": 9090 -} -""" - -from airflow import DAG -from airflow.models.param import Param -from airflow.operators.empty import EmptyOperator -from airflow.operators.python import PythonOperator -# HttpSensor is no longer used -# from airflow.providers.http.sensors.http import HttpSensor -from airflow.utils.trigger_rule import TriggerRule -from airflow.hooks.base import BaseHook -from airflow.exceptions import AirflowException -from typing import Sequence # Add Sequence for type hinting -from datetime import datetime, timedelta -from airflow.utils.dates import days_ago # Add this import -import uuid -import os -import logging -import shutil -import docker -import uuid -import redis -import requests -import socket -import time -import sys # Import sys for maxsize -from airflow.configuration import conf # Import conf - -# Import and apply Thrift exceptions patch -try: - # Always apply the patch, regardless of environment - from thrift_exceptions_patch import patch_thrift_exceptions - patch_thrift_exceptions() - logging.info("Applied Thrift exceptions patch for Airflow compatibility") - - # Verify the patch was applied correctly - try: - from pangramia.yt.exceptions.ttypes import PBServiceException - test_exception = PBServiceException(message="Test") - # Try to modify attributes to verify patch works - test_exception.args = ("Test",) - test_exception.message = "Modified test" - logging.info("Verified Thrift exception patch is working correctly") - except Exception as verify_error: - logging.error(f"Thrift exception patch verification failed: {verify_error}") - logging.error("This may cause 'immutable instance' errors during error handling") -except ImportError as e: - logging.warning(f"Could not import thrift_exceptions_patch: {e}") - logging.warning("Airflow compatibility will be affected - expect 'immutable instance' errors") -except Exception as e: - logging.error(f"Error applying Thrift exceptions patch: {e}") - -# Default arguments for the DAG -default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 0, # Disable retries for all tasks in this DAG - 'retry_delay': timedelta(minutes=5), - # Removed 'queue': 'auth_queue' to use the default queue - # Optional: Further filter workers by tags if using CeleryExecutor - 'executor_config': {"CeleryExecutor": {"tags": ["auth_node"]}}, -} - -def get_redis_connection(redis_host=None, redis_port=None): - """Get a Redis connection using Airflow's Redis connection or manually specified host/port.""" - if redis_host and redis_port: - # Use manually specified host and port - return redis.Redis( - host=redis_host, - port=redis_port, - db=0, - decode_responses=True - ) - else: - # Use Airflow's Redis connection - redis_conn = BaseHook.get_connection("redis_default") - # Use the password from the connection if available, otherwise use 'airflow' as default - password = redis_conn.password or 'airflow' - return redis.Redis( - host=redis_conn.host, # 'redis' (service name in docker-compose) - port=redis_conn.port, # 6379 - password=password, - db=0, - decode_responses=True - ) - -def get_free_port(): - """Find and return a free port.""" - import socket - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(('0.0.0.0', 0)) - return s.getsockname()[1] - -def is_port_free(p): - """Check if a port is free 
to use.""" - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - try: - s.bind(('0.0.0.0', p)) - return True - except OSError: - return False - -def store_account_metadata(account_id, ip, port, proxy=None, health_port=None, container_id=None): - """Store account metadata in Redis.""" - redis_client = get_redis_connection() - try: - # Verify Redis connection - if not redis_client.ping(): - raise ConnectionError("Failed to connect to Redis") - - # Store main account metadata - mapping = { - "ip": ip, - "port": str(port), - "status": "running", - "start_time": str(time.time()) - } - if proxy: - mapping["proxy"] = proxy - if health_port: - mapping["health_port"] = str(health_port) - if container_id: - mapping["container_id"] = container_id - - # Use pipeline for atomic operations - with redis_client.pipeline() as pipe: - # Store main metadata - pipe.hset(f"ytdlp:{account_id}", mapping=mapping) - # Set expiration (1 week) - pipe.expire(f"ytdlp:{account_id}", 604800) - # Add to account list - pipe.sadd("ytdlp:accounts", account_id) - # Execute all commands - results = pipe.execute() - - # Verify all commands succeeded - if not all(results): - raise RuntimeError(f"Failed to store metadata for {account_id}. Pipeline results: {results}") - - # Verify the data was actually stored - stored_data = redis_client.hgetall(f"ytdlp:{account_id}") - if not stored_data: - raise RuntimeError(f"Failed to verify stored data for {account_id}") - - logging.info(f"Successfully stored account metadata for {account_id} in Redis: {stored_data}") - return True - except Exception as e: - logging.error(f"Failed to store account metadata for {account_id}: {e}", exc_info=True) - # Attempt cleanup if storage failed - try: - redis_client = get_redis_connection() # Ensure client is available - redis_client.delete(f"ytdlp:{account_id}") - redis_client.srem("ytdlp:accounts", account_id) - except Exception as cleanup_error: - logging.error(f"Failed to cleanup failed storage for {account_id}: {cleanup_error}") - raise - -# Removed get_account_metadata function as the service now handles Redis registration checks. 
- -def prepare_and_deploy_service(**context): - """Prepare deployment and deploy the Docker service.""" - # Retrieve account_id, proxy, clients, and other parameters from DAG run configuration (conf) - # Set default values for account_id, proxy, and redis_enabled - account_id = context['dag_run'].conf.get('account_id') or context['params'].get('account_id', 'account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09') - proxy = context['dag_run'].conf.get('proxy') or context['params'].get('proxy', 'socks5://sslocal-rust-1084:1084') - clients = context['dag_run'].conf.get('clients') or context['params'].get('clients', 'ios,android,mweb') - redis_enabled = context['dag_run'].conf.get('redis_enabled', False) # Default to False - host_param = context['dag_run'].conf.get('host') # Host parameter from config - port_param = context['dag_run'].conf.get('port') # Port parameter from config - docker_network = context['dag_run'].conf.get('docker_network') or context['params'].get('docker_network', 'airflow_prod_proxynet') - host_external_ip_env = os.getenv('HOST_EXTERNAL_IP') # Explicit external IP from environment - - if not account_id: - raise ValueError("Account ID is missing.") - - # --- Port Determination --- - # Assign a free port if not provided, or validate the provided one - if not port_param: - port = get_free_port() - if not is_port_free(port): - raise ValueError(f"Assigned port {port} is already in use") - logging.info(f"No port provided, assigned free port: {port}") - else: - port = int(port_param) - if not is_port_free(port): - raise ValueError(f"Provided port {port} is already in use") - logging.info(f"Using provided port: {port}") - - # Determine health port - health_port = port + 1 - if not is_port_free(health_port): - raise ValueError(f"Health port {health_port} (derived from port {port}) is already in use") - logging.info(f"Using health port: {health_port}") - - - # --- Host Determination --- - # host_for_registration: IP/Host for client discovery (Redis/Logs) - # host_for_sensor: Hostname/IP for Airflow HttpSensor health check - - host_for_registration = host_param # Start with the parameter value - - if redis_enabled: - # If Redis is enabled, registration host should ideally be externally reachable - if not host_for_registration: - host_for_registration = host_external_ip_env # Use external IP from env var if available - if not host_for_registration: - # If no env var, try fetching external IP using requests - try: - logging.info("HOST_EXTERNAL_IP not set. Attempting to fetch external IP from api.ipify.org...") - response = requests.get('https://api.ipify.org', timeout=10) # 10 second timeout - response.raise_for_status() # Raise exception for bad status codes - host_for_registration = response.text.strip() - if not host_for_registration: # Check if response was empty - raise ValueError("Received empty response from api.ipify.org") - logging.info(f"Successfully fetched external IP: {host_for_registration}") - except requests.exceptions.RequestException as e: - logging.warning(f"Failed to fetch external IP: {e}. Falling back to Docker bridge IP.") - # Fallback to default Docker bridge IP if fetching fails - host_for_registration = "172.17.0.1" - logging.warning(f"Defaulting registration host to Docker bridge IP: {host_for_registration}. Ensure clients can reach this IP.") - except Exception as e: - logging.error(f"Unexpected error fetching external IP: {e}. 
Falling back to Docker bridge IP.") - host_for_registration = "172.17.0.1" - logging.warning(f"Defaulting registration host to Docker bridge IP: {host_for_registration}. Ensure clients can reach this IP.") - else: - logging.info(f"Redis enabled. Using HOST_EXTERNAL_IP environment variable for registration: {host_for_registration}") - else: - logging.info(f"Redis enabled. Using provided host parameter for registration: {host_for_registration}") - else: # Redis disabled - # If Redis is disabled, registration host defaults to 0.0.0.0 if not provided - if not host_for_registration: - host_for_registration = "0.0.0.0" - logging.warning(f"Redis disabled and no host param provided. Defaulting registration host to {host_for_registration}.") - else: - logging.info(f"Redis disabled. Using provided host parameter for registration: {host_for_registration}") - - # host_for_sensor determination will happen *after* container creation, using container name. - - logging.info(f"Preparing deployment for account {account_id}. Registration Host: {host_for_registration}, Port: {port}, Health Port: {health_port}") - - # Generate unique work ID and context directory - work_id = str(uuid.uuid4()) - context['task_instance'].xcom_push(key='work_id', value=work_id) - - context_dir = os.path.join(os.getenv('AIRFLOW_HOME', '/tmp'), 'service-data', work_id, 'context-data') - os.makedirs(context_dir, exist_ok=True, mode=0o777) - os.chmod(context_dir, 0o777) - - # Push context directory and account details to XCom - context['task_instance'].xcom_push(key='context_dir', value=context_dir) - context['task_instance'].xcom_push(key='account_id', value=account_id) - - # Deploy the Docker service - # The 'host_for_registration' variable here represents the externally accessible IP for registration/XCom. - # The service inside the container will listen on 0.0.0.0. - logging.info(f"Deploying service for account {account_id}. Registration Host: {host_for_registration}, Port: {port}") - - # Get Redis connection details ONLY if redis_enabled (for the container to register itself) - redis_host_for_container = '' - redis_port_for_container = '' - redis_password_for_container = '' - if redis_enabled: - try: - # Get connection details to pass to the container environment - redis_conn_details = get_redis_connection().connection_pool.connection_kwargs - redis_host_for_container = os.getenv('REDIS_HOST', redis_conn_details.get('host', 'redis')) - redis_port_for_container = str(os.getenv('REDIS_PORT', redis_conn_details.get('port', 6379))) - redis_password_for_container = os.getenv('REDIS_PASSWORD', redis_conn_details.get('password', '')) - logging.info(f"Redis enabled. Passing REDIS_HOST={redis_host_for_container}, REDIS_PORT={redis_port_for_container} to container.") - except Exception as e: - logging.error(f"Failed to get Redis connection details for container environment: {e}") - logging.warning("Proceeding without Redis details in container environment due to error.") - # Depending on container requirements, you might want to raise an error here instead - else: - logging.info("Redis disabled. Not passing REDIS_HOST/REDIS_PORT to container environment.") - - - # Get Docker connection details from Airflow - try: - secrets_backend = conf.get('secrets', 'backend', fallback='None') - logging.info(f"Attempting to get 'docker_hub' connection. 
Configured secrets backend: {secrets_backend}") - docker_conn = BaseHook.get_connection("docker_hub") - docker_username = docker_conn.login - docker_password = docker_conn.password - logging.info("Successfully retrieved 'docker_hub' connection.") - except Exception as e: - logging.error(f"Failed to retrieve 'docker_hub' connection: {e}") - # Log details about potential secrets backend issues - secrets_backend_kwargs = conf.get('secrets', 'backend_kwargs', fallback='{}') - logging.error(f"Secrets backend details: backend={secrets_backend}, kwargs={secrets_backend_kwargs}") - # Re-raise the exception to fail the task - raise - - try: - # Initialize Docker client to connect to docker-socket-proxy - client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375') - - # Authenticate with Docker Hub - client.login( - username=docker_username, - password=docker_password, - registry=docker_conn.host # Typically "https://index.docker.io/v1/" - ) - - # Generate a unique container name - container_name = f"ytdlp_service_{account_id}_{uuid.uuid4().hex[:8]}" - - # Pull the Docker image (if not already present) - client.images.pull('pangramia/ytdlp-ops-server:latest') - - # Use the configured network name (from params or default) - network_name = docker_network # Use the retrieved parameter - logging.info(f"Attempting to run container on network: {network_name}") - - # Determine if --probe flag should be added based on DAG param - exit_on_proxy_fail = context['dag_run'].conf.get('exit_on_proxy_fail', True) # Default to True if not set - command_args = [ - '--script-dir', '/app/scripts', - '--context-dir', '/app/context-data', # Use the bind mount target inside container - '--port', str(port), - '--health-port', str(health_port), - '--clients', clients, - '--timeout', '120', - '--proxy', proxy if proxy else '', - '--server-identity', account_id, # Use account_id as server identity - ] - if redis_enabled: - command_args.extend(['--redis-host', redis_host_for_container]) - command_args.extend(['--redis-port', redis_port_for_container]) - - if exit_on_proxy_fail: - command_args.append('--probe') - logging.info("Adding --probe flag to container command as exit_on_proxy_fail=True") - else: - logging.info("Not adding --probe flag to container command as exit_on_proxy_fail=False") - - # Run the Docker container with health port - container = client.containers.run( - image='pangramia/ytdlp-ops-server:latest', - command=command_args, # Use the constructed command list - environment={ - 'PYTHONUNBUFFERED': '1', # Ensure logs are not buffered - 'SERVER_PORT': str(port), # Port the service listens on *inside* the container - 'SERVER_HOST': '0.0.0.0', # Service should listen on all interfaces *inside* the container - 'ACCOUNT_ID': account_id, - # Pass Redis details *if enabled* for the service to register itself - 'REDIS_HOST': redis_host_for_container, - 'REDIS_PORT': redis_port_for_container, - 'REDIS_PASSWORD': redis_password_for_container, - # Pass PROXY_URL for health check access - 'PROXY_URL': proxy if proxy else '', - }, - ports={ - f"{port}/tcp": port, - f"{health_port}/tcp": health_port - }, - volumes={ - context_dir: {'bind': '/app/context-data', 'mode': 'rw'} - }, - network_mode=network_name, # Use the specified network variable - auto_remove=False, # Do not auto-remove the container - name=container_name, # Use a unique name - detach=True, - tty=True, - shm_size='256m', - # Updated healthcheck to test external connectivity via proxy - healthcheck={ - # Use CMD-SHELL to allow conditional logic 
based on PROXY_URL env var - 'test': [ - 'CMD-SHELL', - # Script checks if PROXY_URL is set, uses it with curl if yes, otherwise curls directly. - # -f: Fail silently (exit non-zero on error) - # --connect-timeout 10: Timeout for connection phase - # > /dev/null: Discard output, we only care about exit code - 'if [ -n "$PROXY_URL" ]; then ' - 'curl -f --connect-timeout 10 -x "$PROXY_URL" https://ifconfig.co > /dev/null; ' - 'else ' - 'curl -f --connect-timeout 10 https://ifconfig.co > /dev/null; ' - 'fi' - ], - 'interval': 30 * 1000000000, # Check every 30 seconds (30 * 1e9 nanoseconds) - 'timeout': 15 * 1000000000, # Timeout after 15 seconds (15 * 1e9 nanoseconds) - 'retries': 5, # Retry 5 times on failure - 'start_period': 15 * 1000000000 # Grace period of 15 seconds after start - }, - # Add labels for better identification - labels={ - 'service': 'ytdlp', - 'account_id': account_id - } - ) - - # Wait for container to be running (skip health check verification) - start_time = time.time() - while True: - container.reload() - if container.status == 'running': - break - if time.time() - start_time > 10: # 10 second timeout - raise TimeoutError("Container failed to start within 10 seconds") - time.sleep(1) - - logging.info(f"Container started: {container.id} (health check verification skipped)") - # Push container details immediately after creation using simplified keys - context['task_instance'].xcom_push(key='container_id', value=container.id) - context['task_instance'].xcom_push(key='container_name', value=container_name) - logging.info(f"Pushed container_id={container.id} and container_name={container_name} to XCom.") - - # --- Determine Host for Sensor --- - # Get the container's IP address on the specified network for the HttpSensor - try: - container.reload() # Refresh container attributes - network_settings = container.attrs.get('NetworkSettings', {}).get('Networks', {}) - if network_name in network_settings: - host_for_sensor = network_settings[network_name].get('IPAddress') - if not host_for_sensor: - raise ValueError(f"Container {container.id} has no IPAddress on network '{network_name}'") - logging.info(f"Using container IP '{host_for_sensor}' on network '{network_name}' for HttpSensor.") - else: - # Fallback or error if container not on expected network - logging.error(f"Container {container.id} is not attached to the expected network '{network_name}'. Network settings: {network_settings}") - # Option 1: Fallback to container name (might fail as observed) - # host_for_sensor = container_name - # logging.warning(f"Falling back to container name '{host_for_sensor}' for sensor.") - # Option 2: Raise error - raise ValueError(f"Container {container.id} not found on network '{network_name}'. Cannot determine IP for sensor.") - - except Exception as e: - logging.error(f"Failed to get container IP address: {e}", exc_info=True) - raise AirflowException(f"Failed to determine IP address for HttpSensor: {e}") - - # Ensure we don't use 0.0.0.0 or empty string for the sensor - if not host_for_sensor or host_for_sensor == "0.0.0.0": - raise ValueError(f"Determined host_for_sensor is invalid ('{host_for_sensor}'). 
Check container network attachment and IP assignment.") - - # --- Add extra logging before pushing --- - logging.info(f"FINAL CHECK before XCom push:") - logging.info(f" Account ID: {account_id}") - logging.info(f" Host for Sensor (IP Address): {host_for_sensor}") - logging.info(f" Host for Registration: {host_for_registration}") - logging.info(f" Service Port: {port}") - logging.info(f" Health Port: {health_port}") - logging.info(f" Pushing to XCom key: service_host with value: {host_for_sensor}") - # --- End extra logging --- - - # Push distinct service connection details using simplified keys - context['task_instance'].xcom_push(key='service_host_registration', value=host_for_registration) # For client discovery (e.g., Redis) - context['task_instance'].xcom_push(key='service_host', value=host_for_sensor) # IP Address for HttpSensor - context['task_instance'].xcom_push(key='service_port', value=port) # Port is the same - context['task_instance'].xcom_push(key='service_health_port', value=health_port) # Health port is the same - logging.info(f"Pushed host_for_sensor (IP Address)={host_for_sensor} to XCom key 'service_host'") - logging.info(f"Pushed host_for_registration={host_for_registration} to XCom key 'service_host_registration'") - - - # Store account metadata in Redis only if redis_enabled is True - # This uses the 'host_for_registration' for client discovery - if redis_enabled: - store_account_metadata(account_id, host_for_registration, port, proxy, health_port, container.id) - - # If we reach here, deployment is considered successful for now - logging.info("Deployment preparation successful.") - # Return values are implicitly pushed to XCom (but we pushed explicitly above) - return context_dir, host_for_registration, port - - except Exception as e: - logging.error(f"Error during service deployment: {e}", exc_info=True) - # Attempt to cleanup the container if it was created before the error - try: - if 'container' in locals() and container and container.id: - logging.warning(f"Attempting to stop and remove container {container.id} due to deployment error.") - container.stop(timeout=5) - container.remove(force=True) - logging.info(f"Successfully stopped and removed container {container.id} after error.") - elif 'container_name' in locals() and container_name: - # Try finding by name if ID wasn't captured - containers = client.containers.list(filters={'name': container_name}) - if containers: - logging.warning(f"Attempting to stop and remove container {containers[0].name} by name due to deployment error.") - containers[0].stop(timeout=5) - containers[0].remove(force=True) - logging.info(f"Successfully stopped and removed container {containers[0].name} after error.") - except Exception as cleanup_err: - logging.error(f"Failed during post-error container cleanup: {cleanup_err}") - raise # Re-raise the original exception to fail the task - -# Removed the old monitor_health PythonOperator - -# stop_service and cleanup_service are now defined directly in the DAG below. - -def check_service_health(ti=None, **context): - """ - Periodically checks the service's /health endpoint using requests. - Acts as a long-running sentinel task. Fails if the health check fails - repeatedly or times out. 
- """ - # Get parameters from XCom - host_reg = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_host_registration') - host_svc = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_host') - health_port = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_health_port') - - # Determine the host to use (prioritize registration host) - host = host_reg if host_reg and host_reg != '0.0.0.0' else host_svc - if not host or not health_port: - raise AirflowException("Could not retrieve host or health_port from XCom for health check.") - - health_url = f"http://{host}:{health_port}/health" - logging.info(f"Starting health check for: {health_url}") - - # Get configuration for polling - # Use task's execution_timeout if available, otherwise default to 1 year - task_timeout = ti.task.execution_timeout or timedelta(days=365) - poke_interval = 60 # Check every 60 seconds (adjust as needed) - start_time = time.monotonic() - timeout_seconds = task_timeout.total_seconds() - consecutive_error_start_time = None # Track start time of consecutive connection errors - error_retry_window = 10 # Seconds to retry connection errors before failing - - while True: - current_time = time.monotonic() - if current_time - start_time > timeout_seconds: - raise AirflowException(f"Health check timed out after {timeout_seconds} seconds for {health_url}") - - try: - # Use a reasonable timeout for the individual request - response = requests.get(health_url, timeout=15) # 15 second request timeout - response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx) - - # Check response content if needed (optional) - # Example: Check for specific JSON content - # try: - # data = response.json() - # if data.get("status") == "healthy": - # logging.info(f"Health check successful: Status {response.status_code}") - # else: - # logging.warning(f"Health check OK (Status {response.status_code}), but content unexpected: {data}") - # except requests.exceptions.JSONDecodeError: - # logging.warning(f"Health check OK (Status {response.status_code}), but response is not valid JSON.") - - # If we got a 2xx status, log success and reset error timer if needed - if consecutive_error_start_time is not None: - logging.info(f"Connection to {health_url} recovered.") - consecutive_error_start_time = None - logging.info(f"Health check successful: Status {response.status_code} for {health_url}") - - except requests.exceptions.Timeout: - current_monotonic_time = time.monotonic() - if consecutive_error_start_time is None: - consecutive_error_start_time = current_monotonic_time - logging.warning(f"Health check request timed out for {health_url}. Starting {error_retry_window}s retry window...") - else: - elapsed_error_time = current_monotonic_time - consecutive_error_start_time - if elapsed_error_time > error_retry_window: - error_msg = f"Health check failed for {health_url}: Timeout persisted for over {error_retry_window} seconds." - logging.error(error_msg) - raise AirflowException(error_msg) - else: - logging.warning(f"Health check request timed out for {health_url}. Retrying within {error_retry_window}s window ({elapsed_error_time:.1f}s elapsed)...") - - except requests.exceptions.ConnectionError as e: - # Check if the error is specifically "Connection refused" - fail immediately - if "[Errno 111] Connection refused" in str(e): - logging.error(f"Health check failed for {health_url}: Connection refused. 
Failing task immediately.") - raise AirflowException(f"Health check failed for {health_url}: Connection refused") - else: - # Handle other connection errors with the retry window - current_monotonic_time = time.monotonic() - if consecutive_error_start_time is None: - consecutive_error_start_time = current_monotonic_time - logging.warning(f"Health check connection error for {health_url}: {e}. Starting {error_retry_window}s retry window...") - else: - elapsed_error_time = current_monotonic_time - consecutive_error_start_time - if elapsed_error_time > error_retry_window: - error_msg = f"Health check failed for {health_url}: Connection error persisted for over {error_retry_window} seconds. Last error: {e}" - logging.error(error_msg) - raise AirflowException(error_msg) - else: - logging.warning(f"Health check connection error for {health_url}: {e}. Retrying within {error_retry_window}s window ({elapsed_error_time:.1f}s elapsed)...") - - except requests.exceptions.HTTPError as e: - # This catches 4xx/5xx errors - fail immediately - logging.error(f"Health check failed for {health_url}: Status {e.response.status_code}. Failing task.") - # Fail the task immediately on HTTP error - raise AirflowException(f"Health check failed for {health_url}: Status {e.response.status_code}") - except requests.exceptions.RequestException as e: - logging.error(f"Health check failed for {health_url} with unexpected error: {e}. Failing task.") - # Fail the task immediately on other request errors - raise AirflowException(f"Health check failed for {health_url}: {e}") - except Exception as e: - # Catch any other unexpected errors during the check - logging.error(f"Unexpected error during health check for {health_url}: {e}", exc_info=True) - raise AirflowException(f"Unexpected error during health check: {e}") - - # Wait for the poke interval before the next check - time.sleep(poke_interval) - - -def _wait_forever(): - """Sleeps indefinitely (or until task timeout) to simulate a running service.""" - logging.info("Sentinel task started. Sleeping in a loop...") - # Sleep in a loop with a reasonable interval to avoid OverflowError - # The task will keep running until it times out based on execution_timeout - # or is manually stopped/failed. - while True: - try: - # Sleep for a long interval (e.g., 1 day) - # You can adjust this interval if needed. - time.sleep(86400) # Sleep for 24 hours - except KeyboardInterrupt: - logging.info("Sentinel task interrupted. 
Exiting.") - break - except Exception as e: - # Log other potential errors during sleep, though unlikely - logging.error(f"Error during sentinel sleep loop: {e}") - # Optionally break or continue based on error handling strategy - break # Exit loop on unexpected error - -def stop_service(**context): - """Stop the running Docker container with verification.""" - # Retrieve account_id from params or kwargs - account_id = context.get('params', {}).get('account_id') or context.get('account_id') - if not account_id: - raise ValueError("Account ID is missing.") - - # Initialize Docker client to connect to docker-socket-proxy - client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375') - - try: - # For testing, try to get container ID from environment if XCom is not available - container_id = None - if 'ti' in context: - # Use simplified XCom key - container_id = context['ti'].xcom_pull(task_ids='prepare_and_deploy', key='container_id') - - if not container_id: - # If not found in XCom, try to find container by account_id pattern (keep this fallback) - containers = client.containers.list(filters={"name": f"ytdlp_service_{account_id}"}) - if containers: - container = containers[0] - container_id = container.id - logging.info(f"Found container by name pattern: {container.name} (ID: {container_id})") - else: - logging.warning(f"No container found for account {account_id} - nothing to stop") - return - - if container_id: - # If found in XCom, stop by container ID - container = client.containers.get(container_id) - - # Verify container is running before stopping - if container.status != 'running': - logging.warning(f"Container {container_id} is not running (status: {container.status})") - return - - logging.info(f"Stopping container {container_id}...") - container.stop(timeout=10) # 10 second timeout - - # Verify container is stopped - container.reload() - if container.status == 'exited': - logging.info(f"Successfully stopped container {container_id}") - else: - logging.error(f"Container {container_id} failed to stop (status: {container.status})") - raise RuntimeError(f"Container {container_id} failed to stop") - - # Clear Redis entries only if redis_enabled is True - # Retrieve redis_enabled status from DAG run conf or params - redis_enabled = context['dag_run'].conf.get('redis_enabled', False) or context['params'].get('redis_enabled', False) - if redis_enabled: - redis_client = get_redis_connection() - try: - # Verify Redis connection - if not redis_client.ping(): - raise ConnectionError("Failed to connect to Redis") - - # Remove main metadata - redis_client.delete(f"ytdlp:{account_id}") - # Remove from accounts set - redis_client.srem("ytdlp:accounts", account_id) - logging.info(f"Successfully cleared Redis entries for account: {account_id}") - except Exception as e: - logging.error(f"Failed to clear Redis entries for account {account_id}: {e}") - # Do not raise here, allow container stop to be considered successful - # raise # Optional: re-raise if Redis cleanup failure should fail the task - - return - - logging.warning(f"No container found for account {account_id} - nothing to stop") - - except docker.errors.NotFound as e: - logging.warning(f"Container for account {account_id} not found: {e}") - except Exception as e: - logging.error(f"Failed to stop container: {e}") - raise - - -def cleanup_service(**context): - """Cleanup service resources including Redis entries and XCom data.""" - # Note: This function is now called within the manual_stop_cleanup TaskGroup - try: - # Retrieve 
account_id from params first, then from XCom - account_id = context['params'].get('account_id') - if not account_id: - # Try to get it from XCom - account_id = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='account_id') - if not account_id: - logging.warning("Account ID not found in params or XCom - skipping resource cleanup") - return - - # Redis cleanup (if redis_enabled=True) is handled in the 'stop_service' task. - logging.info(f"Redis cleanup for account {account_id} is handled by the 'stop_service' task if enabled.") - - # Cleanup XCom data (using simplified keys where applicable) - # Note: XCom cleanup is generally not strictly necessary but can be good practice. - # Airflow manages XCom expiry. This code doesn't actually *delete* XComs. - # To truly delete, you'd use the Airflow API or DB directly. - # We'll leave the pull calls here as they don't harm anything. - ti = context['task_instance'] - ti.xcom_pull(key='container_id', task_ids='prepare_and_deploy', include_prior_dates=True) - ti.xcom_pull(key='container_name', task_ids='prepare_and_deploy', include_prior_dates=True) - ti.xcom_pull(key='service_host_registration', task_ids='prepare_and_deploy', include_prior_dates=True) - ti.xcom_pull(key='service_host', task_ids='prepare_and_deploy', include_prior_dates=True) - ti.xcom_pull(key='service_port', task_ids='prepare_and_deploy', include_prior_dates=True) - ti.xcom_pull(key='service_health_port', task_ids='prepare_and_deploy', include_prior_dates=True) - ti.xcom_pull(key='work_id', task_ids='prepare_and_deploy', include_prior_dates=True) - ti.xcom_pull(key='context_dir', task_ids='prepare_and_deploy', include_prior_dates=True) - ti.xcom_pull(key='account_id', task_ids='prepare_and_deploy', include_prior_dates=True) # Keep account_id pull - logging.info(f"Pulled XCom data for potential cleanup logging for account: {account_id}") - - # Initialize Docker client - client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375') - container_found_and_removed = False - - # Attempt 1: Get container ID from XCom using simplified key - container_id_xcom = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='container_id') - if container_id_xcom: - logging.info(f"Attempting to remove container using XCom ID: {container_id_xcom}") - try: - container = client.containers.get(container_id_xcom) - logging.info(f"Found container {container.id} (Name: {container.name}). Removing...") - container.remove(force=True) - logging.info(f"Successfully removed container {container.id}") - container_found_and_removed = True - except docker.errors.NotFound: - logging.warning(f"Container with XCom ID {container_id_xcom} not found. Trying other methods.") - except Exception as e: - logging.error(f"Error removing container {container_id_xcom}: {e}") - - # Attempt 2: Find container by labels if not found/removed via XCom ID - if not container_found_and_removed: - logging.info(f"Attempting to find and remove container by labels: service=ytdlp, account_id={account_id}") - try: - containers = client.containers.list( - filters={'label': [f'service=ytdlp', f'account_id={account_id}']}, - all=True # Include stopped containers - ) - if containers: - for container in containers: - logging.info(f"Found container {container.id} (Name: {container.name}) by labels. 
Removing...") - try: - container.remove(force=True) - logging.info(f"Successfully removed container {container.id}") - container_found_and_removed = True # Mark as found even if only one is removed - except Exception as e: - logging.error(f"Error removing container {container.id} found by labels: {e}") - else: - logging.info("No containers found matching labels.") - except Exception as e: - logging.error(f"Error searching for containers by labels: {e}") - - # Attempt 3: Find container by name pattern if still not found/removed - if not container_found_and_removed: - container_name_pattern = f"ytdlp_service_{account_id}_*" - logging.info(f"Attempting to find and remove container by name pattern: {container_name_pattern}") - try: - containers = client.containers.list(filters={'name': container_name_pattern}, all=True) - if containers: - for container in containers: - logging.info(f"Found container {container.id} (Name: {container.name}) by name pattern. Removing...") - try: - container.remove(force=True) - logging.info(f"Successfully removed container {container.id}") - container_found_and_removed = True - except Exception as e: - logging.error(f"Error removing container {container.id} found by name: {e}") - else: - logging.info("No containers found matching name pattern.") - except Exception as e: - logging.error(f"Error searching for containers by name: {e}") - - if not container_found_and_removed: - logging.warning(f"Could not find or remove any container for account {account_id} using ID, labels, or name.") - - # Get context directory from XCom and remove it - context_dir = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='context_dir') - if context_dir and os.path.exists(context_dir): - shutil.rmtree(context_dir) - logging.info(f"Cleaned up working directory: {context_dir}") - except Exception as e: - logging.error(f"Error during cleanup: {e}") - raise - -# Define the DAG -with DAG( - 'ytdlp_service', - default_args=default_args, - description='Deploy YTDLP token service for ios, android, mweb', - schedule_interval=None, - start_date=days_ago(1), # Use dynamic start date for manually triggered DAG - catchup=False, - tags=['youtube', 'tokens', 'service', 'docker'], - # executor_config moved to default_args - is_paused_upon_creation=False, - params={ - 'account_id': Param( - 'account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09', - type="string", - description="Required: The account ID for which the service is being deployed." - ), - 'proxy': Param( - 'socks5://sslocal-rust-1084:1084', - type=["null", "string"], - description="Optional: The SOCKS5 proxy URL to use for the service (e.g., socks5://host:port)." - ), - 'clients': Param( - 'ios,android,mweb', - type="string", - description="Comma-separated list of client types (e.g., ios,android,mweb)." - ), - 'redis_enabled': Param( - False, - type="boolean", - description="Use Redis for service discovery? If False, host/port must be provided or will be auto-assigned." - ), - 'host': Param( - None, - type=["null", "string"], - description="Optional: Host IP for the service. If redis_enabled=False and host is not provided, defaults to '0.0.0.0'. If redis_enabled=True and host is not provided, uses HOST_EXTERNAL_IP or defaults to '0.0.0.0'." - ), - 'port': Param( - None, - type=["null", "integer"], - description="Optional: Port for the service. If None, a free port will be assigned automatically. If redis_enabled=False and a port is provided, it will be used (after checking availability)." 
- ), - # redis_host and redis_port parameters are removed. - # If redis_enabled=True, the DAG will use the 'redis_default' Airflow connection. - 'docker_network': Param( - 'airflow_prod_proxynet', - type="string", - description="Optional: The Docker network to attach the container to. Defaults to 'airflow_prod_proxynet'." - ), - 'exit_on_proxy_fail': Param( - True, - type="boolean", - description="Exit the service container immediately if the initial proxy test fails?" - ), - } -) as dag: - - # Task to prepare and deploy the service - prepare_and_deploy = PythonOperator( - task_id='prepare_and_deploy', - python_callable=prepare_and_deploy_service, - provide_context=True, - trigger_rule='all_success' # Keep default trigger rule for prepare_and_deploy - ) - - # Combined Health Check and Sentinel Task using PythonOperator - # This task runs for a long time, checking health periodically using the 'requests' library. - # If the health check fails repeatedly or times out, the task fails, triggering 'stop_service'. - monitor_service_health = PythonOperator( - task_id='monitor_service_health', - python_callable=check_service_health, - provide_context=True, - # Set execution timeout for the task itself (acts as the overall timeout) - execution_timeout=timedelta(days=365), # Long timeout (e.g., 1 year) - # op_kwargs can pass static config, but host/port come from XCom inside the function - # poke_interval and request timeout are handled within check_service_health - ) - monitor_service_health.doc_md = """ - ### Monitor Service Health Task (PythonOperator) - Uses a Python function to periodically check the service's `/health` endpoint using the `requests` library. - Acts as both a health check and a sentinel for the running service. - - **Pulls from XCom:** Reads `service_host_registration`, `service_host`, and `service_health_port` from the `prepare_and_deploy` task to construct the target URL. - - **Polling:** Checks the `/health` endpoint every 60 seconds. - - **Timeout:** Uses the task's `execution_timeout` (set to 1 year) as the overall maximum duration. Individual requests have a 15-second timeout. - - **Failure:** If a health check request returns a 4xx/5xx status code or encounters other request errors, the task fails immediately. If the overall `execution_timeout` is reached without a failure, the task would eventually time out and fail. - """ - - # Task to stop the service (runs if monitor_service_health fails) - stop = PythonOperator( - task_id='stop_service', - python_callable=stop_service, - provide_context=True, - trigger_rule=TriggerRule.ONE_FAILED # Run only if monitor_service_health fails - ) - stop.doc_md = """ - ### Stop Service Task - Stops the Docker container associated with the service. - - **Trigger Rule:** `one_failed` - This task only runs if the upstream `monitor_service_health` task fails. - - Pulls container ID/name from XCom or finds it using labels/name patterns. - - Clears Redis entries if `redis_enabled=True`. 
- """ - - # Marker task to indicate that the deployment failed - prepare_failed_marker = EmptyOperator( - task_id='prepare_failed_marker', - trigger_rule=TriggerRule.ONE_FAILED # Run only if 'prepare_and_deploy' fails - ) - - # Task to cleanup resources (runs after stop sequence OR if prepare fails) - cleanup = PythonOperator( - task_id='cleanup_service', - python_callable=cleanup_service, - provide_context=True, - trigger_rule=TriggerRule.ALL_DONE # Run after upstream (stop or prepare_failed_marker) is done - ) - cleanup.doc_md = """ - ### Cleanup Service Task - Removes the Docker container and cleans up related resources. - - **Trigger Rule:** `all_done` - Runs after the `stop_service` task finishes, whether it succeeded or failed. - - Removes the container using ID from XCom, labels, or name patterns. - - Cleans up XCom variables. - - Removes the context directory. - """ - - # Define task dependencies - # Success Path: prepare -> monitor (runs indefinitely) - # Monitor Failure Path: monitor (fails) -> stop -> cleanup - # Prepare Failure Path: prepare (fails) -> prepare_failed_marker -> cleanup - - prepare_and_deploy >> monitor_service_health - prepare_and_deploy >> prepare_failed_marker # Trigger marker if prepare fails - - monitor_service_health >> stop # Trigger stop if monitor fails - - # Cleanup is triggered after stop finishes OR after prepare_failed_marker finishes - stop >> cleanup - prepare_failed_marker >> cleanup - diff --git a/airflow/dags/.DS_Store b/airflow/dags/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T03Q@#jbG~r43G6N?2HXIX0_I`$|4(xAeeYz6NV_Q_EdS?cK39v zy2tj+8p%kJcBM!Wh?O7;QP7@<>_d)w-0U?Mdm&;dK-pDt2)8yQT8T^Y*Gvx&u~H79 zEdTY@Uw{99)&H9MOFSM$FuaYsq!h(TJp(qF(2aJFz7R4qO z4ap1^gH0?MvMCp3GgJ&U*Fhs(41;{oh!i8uIa-Wj8HbvG;Q%5b9NCDjgAG2ga#JoU zIQA=4jKQ9O<3zy;90V-mBo5>7y68)W$_S^v5H`~5p%!Nx72_Z$;tmqSJ2pDolEnmY zgLqs=xC?h*6L1gi1^O251KN)VfDYoFKzHE`(A_u$bPs6X`-K2#AATDT;q1)-;QcsP z=DY*=9iVw!06L6EfF8sY`u2RAh&pDL1w8lbw(uu>A=2``dTCKzn-X1alo=8u>X(o~DhZ!N)4)35st40HWF>!|3fyiD!w8cSQY02AE8{bK6RE7Tcacc z>0?+mlT_TyY%NowM052@Ia9JKE?IFiSf``}nOnHmRL%0bSP72R=FYmrH&g$WB`t*p{& zg-Kkm{_DL6)nn(KwxNj(i@Tb$sOlBR)hZ>z6kn|h2LyfMBB`<9l4ev%F2O{{rA(?; zb0Q1)dHLjL z7jL|MSIT^=bX<9FOBuYU?AcQG+*5|Ol%ZQox7|C+;kC(!lJe>4%crlre@nWtw3!~d zD`hnFh+}w6p$QOWC7{g>vs~WDV z(K2*3Oq>$cZCAG{EbWx$39cFhQpbh7)C?Ad6r!%myDr30b4&A8n@KK&r|I%@+*m8w zFiYCa{;ifY!_a080t{*`>6oZ~0?_yPtrD$Bwgbx)&DPZ=LLGhtNvpaRi{fVASF6;h z?{rJ{#K=gCQ*ii(TL8r?wTaQOcgI;!H?=aU@3yr{nQEr4weqH5*-nAqn$QUYP7lpK z;cJD{R3HnVTbZxUvRH+<&FpMem^@2$T!zBor%@>}ttEba%QT^|bJ2RNP-uCr0ObIc ztl4%wUMTq2Y%-@m`6Tm76%P9)VHo>;hdD>HuV->LpMml`n)P+@lFjBamoC-iLV=h2 zvy=a)Vy>j6F#;zxbf_Mu-l^Orro+oXt!O5xfBb5_y)ip!XpSnhH%B$K71hz}1pIl>WUun)bSN2N3<}gzpSMpI zowI0(Kr1y2Cc18og`Ih`r9&DWKVtq7WP9NJVHH>(AXn1`8Txx0a6~I$ODvs?wROm^b9n52OHh} zjb!Tnj)7Z0y}Kj39S@}fYq2LC31M(MgXBm<>T9G28>xJwyYGIo=Z17cyOq9^+}nty zStR%GWLOS(DUkavG0;fuYxEvy^bY(hBqimw6WcMA7Ia1=w`JH0qZ3Dni+B>_E6$?2$<2IOVX?tP7Z@+2sBMZH~| zGlWv5z&1ij9PqOIv;C;o7Q7sX2T*^;dxxVu>fh-VI2uOXeclL12T^CQcZj1=6i<6& z934g-z1}#Wwd39d|B+exM(X*m&r_UL?}q diff --git a/airflow/dags/__pycache__/ytdlp_service_test.cpython-312.pyc b/airflow/dags/__pycache__/ytdlp_service_test.cpython-312.pyc deleted file mode 100644 index 
diff --git a/airflow/dags/.DS_Store b/airflow/dags/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
GIT binary patch
[base85 binary patch data omitted; deleted blob was 6148 bytes]
diff --git a/airflow/dags/__pycache__/ytdlp_service_test.cpython-312.pyc b/airflow/dags/__pycache__/ytdlp_service_test.cpython-312.pyc
deleted file mode 100644
index 248433d50c66d5486584b2a2f936f0042e56d065..0000000000000000000000000000000000000000
GIT binary patch
[base85 binary patch data omitted; deleted blob was 2269 bytes]
diff --git a/airflow/dags/__pycache__/ytdlp_token_dag.cpython-312.pyc b/airflow/dags/__pycache__/ytdlp_token_dag.cpython-312.pyc
deleted file mode 100644
index d59e82008e9c9ccdf48e9a6eaffb965dfed915e3..0000000000000000000000000000000000000000
GIT binary patch
[base85 binary patch data omitted; deleted blob was 5700 bytes]
diff --git a/airflow/dags/get_ip.py b/airflow/dags/get_ip.py
deleted file mode 100644
index 123d644..0000000
--- a/airflow/dags/get_ip.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import socket
-import logging
-
-logger = logging.getLogger(__name__)
-
-def get_ip_address():
-    """
-    Get the primary IP address of the host.
-    This is used by Airflow workers to advertise their IP for log serving,
-    ensuring the webserver can reach them in a multi-host environment.
-    """
-    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-    try:
-        # This doesn't even have to be reachable
-        s.connect(('10.255.255.255', 1))
-        ip_address = s.getsockname()[0]
-        logger.info(f"Determined host IP address as: {ip_address}")
-    except Exception as e:
-        logger.warning(f"Could not determine IP address, falling back to 127.0.0.1. Error: {e}")
-        ip_address = '127.0.0.1'
-    finally:
-        s.close()
-    return ip_address
diff --git a/airflow/plugins/ytdlp_plugin.py b/airflow/plugins/ytdlp_plugin.py
deleted file mode 100644
index 6929713..0000000
--- a/airflow/plugins/ytdlp_plugin.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from airflow.plugins_manager import AirflowPlugin
-from airflow.hooks.base import BaseHook
-from airflow.configuration import conf
-import uuid
-import backoff
-
-class YTDLPHook(BaseHook):
-    def __init__(self, conn_id='ytdlp_default'):
-        super().__init__()
-        self.conn_id = conn_id
-        self.connection = self.get_connection(conn_id)
-        self.timeout = conf.getint('ytdlp', 'timeout', fallback=120)
-        self.max_retries = conf.getint('ytdlp', 'max_retries', fallback=3)
-
-    @backoff.on_exception(backoff.expo,
-                          Exception,
-                          max_tries=3,
-                          max_time=300)
-    def start_service(self, host, port, service_id, work_dir):
-        """Start token service as a long-running process"""
-        import subprocess
-        import os
-        from pathlib import Path
-
-        # Get script path relative to Airflow home
-        airflow_home = os.getenv('AIRFLOW_HOME', '')
-        script_path = Path(airflow_home).parent / 'ytdlp_ops_server.py'
-
-        # Ensure work directory exists
-        os.makedirs(work_dir, exist_ok=True)
-
-        # Start service process
-        cmd = [
-            'python', str(script_path),
-            '--port', str(port),
-            '--host', host,
-            '--service-id', service_id,
-            '--context-dir', work_dir,
-            '--script-dir', str(Path(airflow_home) / 'dags' / 'scripts')
-        ]
-
-        self.log.info(f"Starting token service: {' '.join(cmd)}")
-
-        # Start process detached
-        docker_cmd = [
-            'docker-compose', '-f', 'docker-compose.yaml',
-            'up', '-d', '--build', 'ytdlp-service'
-        ]
-        subprocess.run(docker_cmd, check=True)
-
-        self.log.info(f"Token service started on {host}:{port}")
-        return True
-
-class YTDLPPlugin(AirflowPlugin):
-    name = 'ytdlp_plugin'
-    hooks = [YTDLPHook]
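
The YTDLPHook removed above exposes start_service(), but no call site is included in this patch. A hypothetical consumer inside a DAG definition might look like the sketch below; the host, service_id, work_dir and task_id values are placeholders (the port matches the 9090 seen in the client log that follows), and the import path assumes Airflow's plugins directory is on the Python path, as it was while plugins/ytdlp_plugin.py existed.

    from airflow.operators.python import PythonOperator
    from ytdlp_plugin import YTDLPHook  # module removed by this patch

    def _start_token_service(**_):
        # Hypothetical usage of the removed hook; all argument values are placeholders.
        hook = YTDLPHook(conn_id="ytdlp_default")
        hook.start_service(
            host="0.0.0.0",
            port=9090,
            service_id="ytdlp-service-1",
            work_dir="/opt/airflow/ytdlp_work",
        )

    # Inside a `with DAG(...) as dag:` block:
    start_ytdlp_service = PythonOperator(
        task_id="start_ytdlp_service",
        python_callable=_start_token_service,
    )
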
diff --git a/airflow/ytdlp_ops_client.log b/airflow/ytdlp_ops_client.log
deleted file mode 100644
index 4fc17e6..0000000
--- a/airflow/ytdlp_ops_client.log
+++ /dev/null
@@ -1,14 +0,0 @@
-2025-04-06 00:41:03,141 - INFO - Attempting to connect to server at 127.0.0.1:9090...
-2025-04-06 00:41:03,141 - INFO - Successfully connected to server
-2025-04-06 00:41:03,142 - INFO - Server connection test successful
-2025-04-06 00:41:03,142 - INFO - Requesting token for URL: https://www.youtube.com/watch?v=sOlTX9uxUtM%27
-2025-04-06 00:41:17,930 - INFO - Successfully received token data from server
-2025-04-06 00:41:17,938 - INFO - Valid JSON with video data: Операция "Багратион". От поражения к победе.
-2025-04-06 00:41:17,944 - INFO - Successfully saved info.json to info_json_sOlTX9uxUtM_1743889277.json and latest.json to latest.json
-2025-04-06 00:44:05,608 - INFO - Attempting to connect to server at 127.0.0.1:9090...
-2025-04-06 00:44:05,609 - INFO - Successfully connected to server
-2025-04-06 00:44:05,609 - INFO - Server connection test successful
-2025-04-06 00:44:05,610 - INFO - Requesting token for URL: https://www.youtube.com/watch?v=sOlTX9uxUtM%27
-2025-04-06 00:44:18,350 - INFO - Successfully received token data from server
-2025-04-06 00:44:18,357 - INFO - Valid JSON with video data: Операция "Багратион". От поражения к победе.
-2025-04-06 00:44:18,364 - INFO - Successfully saved info.json to info_json_sOlTX9uxUtM_1743889458.json and latest.json to latest.json