From 3673fd7b7e3e327456ae930b4445d06d51f149fc Mon Sep 17 00:00:00 2001 From: BVishal-Geek Date: Sun, 10 May 2026 14:33:35 -0400 Subject: [PATCH] Handled data leakage issue --- .../BreastCancer_SVM_v1/chemo_model.pkl | Bin 4723 -> 10621 bytes .../chemo_model_training.py | 336 ++++++++++++------ .../BreastCancer_SVM_v1/combo_model.pkl | Bin 3259 -> 7538 bytes .../combo_model_training.py | 308 +++++++++------- .../models/BreastCancer_SVM_v1/utils.py | 194 +++++++--- 5 files changed, 554 insertions(+), 284 deletions(-) diff --git a/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl b/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl index d463dfb94c44c68ab7c97ddfd96bd38a8ca6d79d..41d5e5c7f33167b6910f0cecf52697ffd6166a0f 100644 GIT binary patch literal 10621 zcmb7K30zEF|8H8AN_Z%QP*k)@WIgjBNmP=Z=9#8jGd0bOStvq0M2sa}CHs=HJ@F*( zlh<;_l2R%vMf;x8GNmMi|CzO3%X{Dd{q()(-qY{*JHK=8`QByDH%$FF*Fr_QZO6@p z8bNGdjEM^kc!5le%4Jz|7#xhrU}3m$fQRBR3b)vWORaf)jKjk>B1cUgjmpHhxNwxx zlZ$b~%v>2tdkce4rvzbC7QSCNKzhRC`=VoY z8k-Y>N_`f^52Nx0T#QFyuqe2zj&Kl*!lMRpn5c$_VW>_xj3o%-gjmzqT+BL1z~nQi zTrM>Pmt{xg;!(IzLu!IbUw#M&lhmj(9Px<#`*2U(+*M6TnuEK!xh?sIHtEh)g^&9z z7;r^4Mk;tXvO~j!5~bku--NmpE=FUsczmva#z)cUhC(e#2PMr$;jSus@dYFwTo#<< zsvA{k{5a+*sTc#paA)VFj(gxv?uwq} zK}Y3%6$5Fr1{C>3q-@5xJO-PEu9}^-y|o=K@CnC-YP`)sxbQc{WJPDz6dIGt<6%7X z&lRf{psOdJ<%Yf(jUkCZ@e{CUe8~ZiPDOJ^sK(xmX1tCM7o!GJ_zdJ4A=C`SxGV{U z)VMx=Xy*B1{#*>h(ah5Drv?R4ac6KhR<2=w2PE27{i~=N0j!VGN~*Y=8Frp&?xz^c^K-!fFNoxg~3N;mJ&dy%SDuB2T^!@ zDj!9$6lzg;94c30OCN%0sMkN}&#tp7nrmo!8LX6Uo#3Rj{|&P1G(bM;zj8E2j*0;y z$q!`nit_z<4&Vtm95$E#vrsNF<}s%AQT?aPn{SF5(MIhIL@-1chcF(&48a^>GQvy* zCxk@^s}NQrtU=g-uo1xF6=F4W_r*xXl;70Uk+$7bQ2cz%Rlm zt5Y_fO*2|?AKcvOn_D;X1srj~%)-k>tc(}IIkzSxst|mQUurn^`AfohCa>U!^B(PR z96938G=pkzznn4^zxNV0P1N1#mYWO2X}N9#UsZyDYwi~(*mOW;BEL8&9>rG6wE3&y8_Z0$m^-=o9yVi$LTe^S#DTE{dKj^F<1^X|p#o`Upl-G?3+9b`o=_ehFXC7-Z57rWH< z6}V&DQu!gV4TkToqwcaVAdk@u~GO+P`;zuq zWOV+OzRA6lFy5cXIN`HQJ^*~tQ|*;nEpT~=)r4Wj@8AN%tq+PeRe&9q4mCcXYhcr! zx#N<q!x14nz30Xg--8qedO3HzHo zG~*Hvz6Mv~C;j!Js~CEymf5eK)CLC>&0&;&Xn~7kx!L)yZ6H5p*9O7hcZBg&b)UAc zwR!;_G29Avb-V-oY%kDU6 zUJatD-M$l~ivP)Eq0JwV^!dF8Hw8IWF(K6}HhPlWj${g;Pl*YpN(R_AK?h0+q} zGTPVE=W!KSTxA@!^>7V5oqg(R^?`RF3a(mkNc@Q~KimtiY)WZ|`sA3!S%(ThQ&~pa z(%r8?LFbDhYu{Btt$dx|Rv#_`o#)*wiZ`_p=C}5Q*^w2gC9vk9$<3^{S)e?_^7H6# zef%}>=gheAr5vOL89q#$Qa~85-0U`e^4Is!a5HKb0$qW*-991_$24&J4Ss0e|KMEb?#IVttp!qXj0JF5|DM{aw}newt>-S zOa|A&=d&zBJIAY&XJoJrGJneli&wdNrDuEstH;^s<;-Y@cw~8osWI{ObzS;tuy4Uz zxMt#2;F&-oi|5$9S+TDYzN6-l4kh=3jEaG&yGM3HpN*}qZ5xT7Z;{7pb=k&ZaHp$x z&elh7pj&AuZ!&leBQ1-sW48{2XIr*(Hok8GRawP`ta zdyvj`VC)a5R6aj?(s~9i^R570@d+0^X&Hp^oO&7x=HDZc6ZJEeZRzNMOSrZffy@f% z)LJ6$cFq9FChKzZu|jxb5V@>(LnmRp(7Iv1JBuN?KfX z`!sq^X)n3!6wn6EJiORu`-z|DE^AUFM(I5Pj(J10do`LtO7^qxi2O2`tl3k)`)n@M zhFJ`baVPQ zEVk)((V%Mhi`Ob;b!3psn`7=pz`#+s(ScM62G72RZPiKy4tLkm&UO@mz~N!{?6$uJ zE2^8v4;xWQn4j%GZg{cUdwAAJG;*QS6L1*HUAXkw?lW){p>p;5_wx#)U#N2DzM+pVX~fy3FCdedd|CO>TU3F ztx-{=TM_gez>n*=)(R~KKkd3xRRwi)tdBKPK7-f{_cO+?+X(yXUp9{;9Qh3_3bGW| z-~0sgH%&?17xM*d+x}*RD&;+NwjKYbz_$kWdKlH%Z&M{JC)zI#B1tZ)c3;|{BWS&> znOO-oEUSJz;$#gRQ;U>TM*T_#GcuDixKdNg8`|Frl;Cr|Jm%Tlg+e4QBO#>4P;@XHe07FLdyUFMo)k z*!b|>p4WBoX=6@e+=z7O>S_~@Px?q0Z{>7SPhLhIh-OAk^juQ~Z_nt;E!Jv-8tYs; zSeNUe?fnrR-@bkT-|TODo0k#aUlwJ$);_%Q9t=M1Y#bGX0}JQuRz1#H;H_sCx(>Abr%Ay%T;pe{)=GVGp$gZgiOF+b5T!YaU+u^R=X9Cj&&0x;l;*jneHy|nD zfR$fp;JD;?qwmkMAKb+A-@V{o-ZuQ z4!8|kzs*cY^D2bDm?p|<_SH?a3hz3YG-1)jJ-lMzROjAtX2W~9`B8zEkyQzN66tZ+ zMpz7z9WPxSK9czTVBPh-tpigiPnUDh<_0FZhNkuSrhi&qlnN6THTIbl=E5zSN zk8)$??{$fW#a0JH7awT?UdOZlx;e2GGOoWmVX9RC-;G=zrpi?%|MSQ)bqk01dA@g; zd-Z#ZY`CGRc-loxF`RHd=c1u;8{l`|JAW|#G7RkwraL#a!tz7O!hze%3H#d^ZaSjD zwF<_K79Y{4cS7MMi;eplTfo~t#sn;U*bID@8}fX0JK)~Vog;(CG!Vv%J=9&4IJp%% zb-Zi;Soapprq*`Eu^Qo6%3SO^`7K;gII{Pw(ObY;Q89J(UE=rWiH_m3mO8wEyYH97 zym_U--}&>aEtl%x4flhi59>5TBdcL`V=q_3i(w;ffrMhh{7$kj+Lc~=1{LowuMRr~ zy*bvZe1FNmaoth%Z0rC0{&Ec`EvXULL4!>%H2Jry;J0opb=}|==tOEXIC!oOx@{OX zba`_Pu%G$23!OxKe>R-$&q;B92xvlg?fUdr5DWjXW&f4F-%GwMtQ%f3B?AJrz>3uO z&j|ZFJ!Ys;$AmoaHa{nGD=QCL)P0_M6js55w|$GYnzX`~Ho}8(mbE}<)#&nP|0d?I zb3)#nE&5l0!D7;Q&)^)cjJHtD`(b@H8HiL+tA2w~F zYeR3m0^>&+UQHd{1;<4&&$mn!!;r2MPv$vSgEw#eEsgWufrb(rFSCf}KzXD4#X-i- zgb6zjG((-+O@VpEQLuda`&o-@o1m%A)W@Omjc^9{_?`2|YvBL-ed*ileBXaeBecr( zNKQ^Kg~o}wt44;YlP8yBb2=^ZVdTu#(o35Ox2=l|YiZ?7t>V(()vm2f!c7g`=-Cu1vX~3T5 z`c|{UwA?D(fgvY<*D7vz5-f{ZGq0(YG`P6<9g+k zGPqsCYPr3CHE`NdofjPYoG`zxQz3`moPPlJhje6{jI4r*4lN5WxU~V->M37sFaHai z^c&A~Ny`F{CmQYEFrb2PzT{8PO=>i22h)d~<6l^m3(r!vI0?5EfE%QQA$*N0n7;hC z4swVX*q=;KsWcp`en|!%Kl1!1=pdV6(QNu!%r61B%hy?@2pvEMN2Qn950hLLStpEJ% zCv%dki|i-!#j1)7r-@`oeP!kHTY)K%BpTbfx(3tK7g_n5=z7M8;hbR;eYHx)D(yq> zPMWm-ft4t2wg1*uFH>c`vo>}{)oh&dToM0d^@}N0xwgvsr6=~r?O6Lo-sQgb74|ya zaepsLnkZ^I+AP+a)7P%0!=*0wc~&BB9QSYM{X;~|<9DB)%G6NCRru#o^0Q6{7>KSF zuuZSkj}?)-_T^j?nv!eAwR^0+Y(kdVnJddZCSh|puT06U56ERY8%WCfXljXe`dAY( zV_uh7^SZIJe$=F@`V9Iw<++MVkJY5Y*HW>U94b)y;CdyLcH1|JlkhJEcT z{J$^thnijY4(kREiShckD-}Np_4ys6?%x)dp|5Q3jQgH@{}SxsU(BeyZiRB`MW?{heLpsBSoAzz=q^dBR8Y&sC(`G_stI%6d5+<#))w z_7(PWI+EpdRQk#3NS4!)ET^OL{|bLO9m#S!lI3(1$>}JP*_kU#IUPlEIx6esbQH~&5PDhcP zjv_f7MRGd+U@xbmvR+O{<+&oRlv`RX=~PG0fOtBY{<}n(fC)8&upl-!1b3sai;&d{ z4Up=EgH$FM1!K{WLI_<{bo=-@V;@(e_ zA|!==CxQV;J`y3(`KY=f`3uPt$54elIej*l;g2La%GAdGA~m@I5?JWDv#`Z%{`cug z#d3x$ua>q6T5;p&jId1X1V`Xd>YcGQ@>L(LF~Vf{%uQ>cd1SO2iQ);lRE~6eN`T z`S3rN4y~p{N5VtlG?)ye#*hv%RcJ(!BuZiU22-eh=*C9z=duMHiP*zUsIB}LIx#?R zAxec58WxEz%T<*szeqZaOtY7`z{%dm@EN2WU&-&SiO&5LN%+5Hnv`W$p zNAhT6@hL$JG;w5ZP0PZMeBN;g_H%QJ?7Yw&en*zDgv$UW=h3; z^pEtf0h&k$6QD2I4VUc3NOlt?J8Q{qx@0#G?J%J}pG!p);!u$QiY1+O=0bH3+o`sQ zW2l%aE9_;31G>NI@i891zrDoH&Jm{~`fo;ZIvx`4N%N3M4AuSEOkW%kX#kZ*6L6`B zqR!x@tPesps(^+&@BAb?6(IWnFO-w3YQRcVI4gP@uu5|FqnVn_4Aiu1yPM$cGBZ$U zU|M%!ld1p{>Piz0!uVVUP0AXny=-22F9^}Ab*PbDr2lL{Xn^huegdgtEQpGngCz<{ U^yjMUs)b2P6;~Au69}yT5A!B=f&c&j literal 4723 zcmb7I3s_C*8tz_pNE6ZtMWKs!#Wc*2f6=6}sg6j-WzD{<+FSe9+B{7HJwdUmRd)evSvbMT(W7NNeUfgDxg#8nY0Jbl*dV-#r-Kuk$RbuV5XQwjHlDn zbZ^Zm)0K31c=)vU=+ge-0W~d1X9(Z~FIAFEk_tIZ^@$)QaVRtaN$^!_6bL((N)|KR zSjs~PhSJ_jf)o;RH7=8=6*Oh5ABt5Sda%*G%j2k}6LH}Rm4Swo1iA0djr`ZH4uUV2a@i^u-F1p4vd%L#)on`B_WXGUC4 zZQBpl-1h%*enTdBCE__gRxbo})K6nA!^^b(n8Q9yKaXb8w^+<4KNr@NzbvYsYGP7d zfA{?F+umZ~zfIegCKv}|;fmY_=n24)Z(cMNl}$Q3ksjUNmJBkUjxWBDFlMrTrgl*+ znS}9W^*Mg{bq-%hW(gCqHK#1bJeZRWE@#E~rsq>&^62>+qGs`dro3R-yWlgxW6`OK zi$0s#Twen1vi)$jvT&p&o!l(_xnsHhbfRDbO_{3}3u&sqzWP*)Wn5e+<*QVL3cZYq z4nmvHman6T5K0&XJT_4-R5Ld!F(1jEGFL1{GUYJxjmOmzG&P;FiYG`pAw@4qK`c#K ziHJCoAZVUBWf{kp$@p{#IJ1Mfc2br?1rh5|M_5lDXb}r!*2o4Rb3!%@*>Gedk&Qw&23a7o zU}PL*w#Xci^+z@Y*-&I|$VMO=jf{)T4_N@R@yIY_OqJ-AstzX8_w<#s(;oe%T8E_|^F zZ-bwKF+3(JPmKJ^s|h#-+zs91a~r03Z67wqwGleXi;FcwU%+qfCgc^@RY2kB#w#O+ zKQ)|D27pg)1i$31p8Q=q^vQl%aJbMM`^_$B!~r#~~~J2vI%H><7MK>MP+gM059=g)J6{krMJ zuRz4s>cJjws^FTRue5ElZi1NwVVp;opTh+WwdZ*o?}JHU9ZS#HwHW2E@l8fS;kK7x z6FkIgyYUokjt*|L8&w1MjNaCJ-M^nhxqI#4e~h~dY69Jd=32foisv@B_ROp#4UBh7 z@{O!3hrwGb=I*)G3atF*rPi&eff=>)c4v0fLECq~y5F9E+bDm#b5ahK7CeCVtLn;Z zCEY;NeDUD^(&sQ>6{oz-wHn%9%iB5gVl#-(Sd$U_{S~9}#V?9KvVC9)+_g1zTIJO) z5c9%wE?r*(^!gJH+TrJ))(XF$UE`BE_^HwOzTG~Ox67dxR^){FoZ5Q>nAU_el?-n& z>@N=7yKc;T334|bU)R2>+9+PW$3HDZwYA{MeCxL1H=cpKfu5ChUF8P6+zXGY*8gx5 zPEd?2OMlJaC&16-Tn5bEu63)Wth$IbG23A*-Z z!ba_CH;T9C_P@f?eXHTdw>k61sUCpRZR5y*&WCVnn(y%;E)U>3XA}DItVZxEx7YTp zbGMA|k{HZkrAu_qVt2{p&N3G%Q1Upt}^>d2N{O8u1)P_Uk04jf*jg zxBmQAr`40Tz#5+b{vU73fO)U=(cN*+;Gr4IkAB@hA4W`1UEsa`4h)pPh{RK?jN&~x z7MJAjUkNpJ`}mPmJO{Jkpe; zo>2#+`9I9!x;zJ|Hl2wlv+7}AP4gZ5$9dr3>jmweH@m@%sLqbR_3Z+m9Cv>iK5`Zw zTV7ZK_}&{s|NZkbsGR;j`%=eQ7{Lw3x+|(+Kj-g!2bP?JuJP;ZR;?^FiWeB@2g_Hz z20_V;AS^z)!2ID`U9)?+B-S6L6^3# z1ZSo1;KD5RJ%{2J*fBg~;q)d;Y-mfv4$`CxrXKOkIqmhvD1QqQx(~g3`v)u-{^RpO zaTZv1YRQtMJ`HfGbj*;~GfUvbPIISgZKhag^MRB>6AO&u_1{&6m2h5xlxqoPBA5Gc ziDmS{4bSgEd#ukvHkUMQF-~$UJ;UYr0BI#{+3i6y)b)aA-rei7OJ^b4U48; zYMyw|1|ug;JMnf-KDg@_b5JwSOmm+aGblEw&S-q;yLSDu?9x?`n?>;Krgec83+d&G z@fA>hK(G|^x&Tj~yEetqwE!j@j!!!I+W7wZ`J7BnUh5&&$SuNclmWkucOOKVN1+=;imit@WO<--X0Ov;Doma z6uXrI(6Qx>tg;#QPoHOh?Z}7v&~_;t9JRf={;G!$8>YoeOAoT?jYMyUOgBZ8$d}?w zliO2EW6E>ZwlqQaLo$w(O22;y6Fb{itbJpvaSqQC*eo^IWTS4;J#4HdE$=;L{hgIz zeEPLwj&C$a;}REL_TrVd;WFBe8Zkqe=Iu6BzxnLBR zX*@l!eL3CEt@Zt|*`r;f>h}-Byo*9R_PVu#+?=WVQU|sIz2Dir$qvW99*8;7%8J&; zTJXix+ZpN%Cj(B3#`|Wf!VYsxjXd2t!UL1JFNrR{Rsbff-Qnp_Hw@E}Ad#b1T1Y5+ zY)egbt+9=Le2}xFN4c&gw!)bvesT{q>)m4pT(|tXvBFQ14yO`ZwW3rAo3x_ 1 else np.nan - cm = confusion_matrix(y_test, y_pred) - print(f"Accuracy: {acc:.4f}") - print(f"AUC-ROC: {auc:.4f}") - print("\nConfusion Matrix:\n", cm) - print("\nClassification Report:\n", classification_report(y_test, y_pred)) - - # ------------------------- - # SAVE BEST MODEL - # ------------------------- - if auc > best_auc: # <-- CHANGE TO acc > best_acc IF YOU WANT ACCURACY - best_auc = auc - best_model = model - best_cm = cm - print("šŸ”„ New best model found and stored.") + acc = accuracy_score(y_val, y_pred) + auc = roc_auc_score(y_val, y_prob) if len(np.unique(y_val)) > 1 else np.nan + cm = confusion_matrix(y_val, y_pred) - fold_num += 1 + print(f"Accuracy: {acc:.4f}") + print(f"AUC-ROC: {auc:.4f}") + print(f"Confusion Matrix:\n{cm}") -print("\n==============================") -print(" BEST MODEL FROM CV") -print("==============================") -print(f"Best AUC: {best_auc:.4f}") + cv_results.append({ + 'fold': fold_num, + 'accuracy': acc, + 'auc': auc + }) -#Confusion Matrix + fold_num += 1 -print("\nConfusion Matrix:\n", best_cm) +# CV Summary +cv_df = pd.DataFrame(cv_results) +print(f"\n{'='*50}") +print("CROSS-VALIDATION SUMMARY") +print(f"{'='*50}") +print(cv_df.to_string(index=False)) +print(f"\nMean Accuracy: {cv_df['accuracy'].mean():.4f} ± {cv_df['accuracy'].std():.4f}") +if not cv_df['auc'].isna().all(): + print(f"Mean AUC: {cv_df['auc'].mean():.4f} ± {cv_df['auc'].std():.4f}") + +# ======================================== +# 9. TRAIN FINAL MODEL (on full training set) +# ======================================== +print(f"\n[8] Training final model on FULL training set...") + +final_model = build_model() +final_model.fit(X_train, y_train.ravel()) + +# ======================================== +# 10. EVALUATE ON TEST SET +# ======================================== +print(f"\n[9] Evaluating on held-out test set...") + +y_test_pred = final_model.predict(X_test) +y_test_prob = final_model.predict_proba(X_test)[:, 1] if len(np.unique(y_test)) > 1 else None + +test_acc = accuracy_score(y_test, y_test_pred) +test_auc = roc_auc_score(y_test, y_test_prob) if y_test_prob is not None and len(np.unique(y_test)) > 1 else np.nan +test_cm = confusion_matrix(y_test, y_test_pred) + +print(f"\n{'='*50}") +print("TEST SET EVALUATION") +print(f"{'='*50}") +print(f"Accuracy: {test_acc:.4f}") +print(f"AUC-ROC: {test_auc:.4f}") +print(f"Confusion Matrix:\n{test_cm}") +if len(np.unique(y_test)) > 1: + print(f"\nClassification Report:\n{classification_report(y_test, y_test_pred, target_names=response_encoder.classes_)}") + +# ======================================== +# 11. SAVE MODEL & ARTIFACTS +# ======================================== +print(f"\n[10] Saving model and preprocessing artifacts...") + +model_artifact = { + 'model': final_model, + 'response_encoder': response_encoder, + 'origin_encoder': origin_encoder, + 'iqr_bounds': iqr_bounds, + 'feature_cols': feature_cols, + 'train_patients': sorted(train_df['Patient_code'].unique()), + 'test_patients': sorted(test_df['Patient_code'].unique()), + 'cv_results': cv_df.to_dict('records'), + 'test_metrics': { + 'accuracy': test_acc, + 'auc': test_auc, + 'confusion_matrix': test_cm.tolist() + } +} -# ------------------------- -# SAVE BEST MODEL TO .pkl -# ------------------------- with open("chemo_model.pkl", "wb") as f: - pickle.dump(best_model, f) - -print("\nāœ… Chemo model saved") \ No newline at end of file + pickle.dump(model_artifact, f) + +print("Chemo model saved to: chemo_model.pkl") +print("\nModel artifact includes:") +print(" - Trained model (with fitted StandardScaler)") +print(" - Response encoder") +print(" - Origin encoder") +print(" - IQR bounds") +print(" - Feature columns") +print(" - CV results") +print(" - Test metrics") + +print("\n" + "="*70) +print("TRAINING COMPLETE") +print("="*70) \ No newline at end of file diff --git a/flask_backend/models/BreastCancer_SVM_v1/combo_model.pkl b/flask_backend/models/BreastCancer_SVM_v1/combo_model.pkl index c748ec6dcd05bdf98d4d809ac424798f463e9b92..9f2d93150dff91db8ad0ccabfba7017b9012b63f 100644 GIT binary patch literal 7538 zcmb7J3tUav_wR9gs8o|jCQ-~#k)}*jhRnW+#HBojMjpqx_nx{}_vPGkqmf4xm1ANY zAJ4*g=WPssul9{Fh7j_T$0$8T4`mdg-#+)%{gJ;r=VP7I-gm9F*IxU(_Hn*@31?kg zFEh=xknyIiq;er4VQ6P%yoA6>nV&+eAS7ZL!O*r7^nb2pe3BWBwvr+gN@gZ9v{njm z2|+S+PlF{%C`h@0P%6c;I6q8?Q#eCAPNr~~5GRF`wdw{}o#u%t5hf*Y8IwlaY8pyP zi2mjXToAc~b z8PS)tMXb?)qFx^lmmc<$Zn#f88h{&nS)_9dy(9KY%60&34xBx9c{-?#v3vgHw`kyA z%~5Z>eFucFS9KOw?}$OOp*aaSiK72n)SnGdFO3|#Rdn<>2TLzmL%Ygo=o@J^i)!SVA6{hz_LNcsHm4M1!5@l&`v(YI*e zTyAmeqf2C9CaA7_?=~cSIhfsbwdk4OG%%@djkQJIIgo2!*p|nqb#re3nz=h;I)Gvc z1gR9uWoR`6{J!-IU{w6&3~i}QkTP^vJ%~}u4--gmrIJvhA16-@L#wJozpIcCh*=L< ztV$-J*gupa9AS#KlqaA{u;Y^i9*oHXw#Gipr>#<@P{>KDBUk;k z6)6sE-vT8j-w1Vu{2Ni3^ghNZgTlBN>2X7?R0IrXrb% zM1VwuM2ut^l1wBUk!(Y<1Ib<_`;i<%atcX4l3Pd~At^=Dgyb!f_eiv;)=x2k5KwYb z+2K@q_RHCXk8tjc%sz7opDesP$g2hT^tGCz6uk$#oQ|drcB_ZeuGvk@x2=P|XP4(zj~#s)z3IZKNtUzRpI{lI^?}NHNDpT037V&Gv5B6=kSGq+8QJ*fwRG| zBd@v*8$0&W3wYyPQu7{OJ@gOQackPKD)@NN!Z+}j7T}*G z-Tby+i^+anIlkM4U(|zNgAXlUw4?!Mhs_#W`F(r5&&I^|pRnc;*peihJ=LMkWWSL& z7ClqHwBzoP{+J|cY=GnLS?-mstAf8f+m!|QJcTf6=CMwZuiy{4zwoEzbGWAFH%*yv zKW0-AC^%bn&1^t1{Ni%&BY!8&wH;c?T>_j@KVL zTWVVlxUC*BE2fyfZVf{&uU@>ymfJbIWmaTRGrUM0HN$Tf!8zRPZ+1tO!9MBx8%C{p z3hRAeg@#9)o4jrt(pwHUPAUb=s(5Rm>smM}h<@Pfc|#M8N?F&o4KgXWWSj=W3M@kD~H156xZP|>tK;DXGX;JTG(^-Pxzf958x`r zx!0c7r65N#cu|mNB{X%PJRP#fY;8jU^v(`DTDbl)j9;y~6ztgmXZ-C};&S65oalP# z*x0XXpj&ls+4SUcll@*&SGZ)qehg0+cWv4oSOMQ(i)ez|>I^tr7q8^ZJ8=)V3w#1= zo;^0%ua{r&sN9#;FgrS~##pF1VuMNOc4_k$a&bLwGgTym&% za-|{O0r{IB&Mz$kZRbxXsCtWsEwi`TS3&yuNpr5#?e*rB`SS9- zONC%=?p&}$Rbz5~XJ-6$bVWf448D_eJ@NWo7!tLTH*C<~(5dT$t0Vi~hn-^L`aG1M zfooR2ONa_Iecd=`zJKHCdl{VXT9DQI-ANEQzJB_~oh7jM-Zek$&$$AAO9Uh>FzVpzAP1$h@53L0S51&cdo;%6quyE56@ z$$x7;JW#ZBRDp9o>?!VMU7q?JemyB5=j4te_*`JVe&CHJFcHMK#M~-08D~F-46w*? zd<3|{y~`#(sskzVZi7?3%AsiU*n=7N6>!qN__r&GQV=-qWN^jAhbH^Y8Y7?Q@t-!} z@4J54sH_&Ce0i>QU|J2_HuvzIfRt+Zx@2An8r0Xn0C3l=Kb`ys+;a7~X9<7Ol7q&F(t)@%yLBBTo)rNH|IZ*BY6KDeqCse#@PSG_e(#@c7l5)zuwF&Z{L44Ur}}#`h?ehD|Wz+u%D|`dYG*(ilR3X zt%*>DOEC6>y`xVqb6$V#Czn6^L@iuWn(pJG_^Y?S81(+(rq1eIuh2idoGrLJW=2HsQd@Gz z4(lmOQaN&uufI8K!~zd)^0OJ`1--f(+C3KBck8O`VQB01{_S(!Halk8X^z3qVE*u! zgj8EYTkkL0C39rnwML+C+wa=g%gf5upr=Z+f~zsH##pW+w1XAYt8b~eJMA=l0gcp0{6FfquL?q*HonsG685oTN<(?5 z&__1UjP8UXLQzYLd`zgZ10u+9`l996PDjHKVXO~^*T#O3)AoqYG<}L{jT?LHU&D^8 zgB@1~JFX6PTpjGVI@ob_u;c1r$JN13t%IFf2RpTH+0;7N8QMD78QOZifBV$I&frG} zJ40LVPX{}wZ|h)Z(9^-r(AMjH>ZwVvjZCwosHS&|xS~*&8YF0IDIt}UiA=MUC$jV{+9rk;Cz<;a~Q`yN9nZnWTJ@p7(DELgUI42BeIw_ zPx^qJ28hp1X9;+VNW@EpC^<{2YuP^9PKcO(g4IBTVP|bGCRb4k6@_R+9nsDb6uE{x0_rTc5F|lwyhR33c#^Ol1O2N{vk#r}+ zPpILHUNeTX43sDK1chgb>sHN9JeS1ya+OS|)Eu(djhc_Li%wAxxhv`}@(W|%KblK( z7STZ$+El)X(k{JG(1mOrbhg@wcA(Qim(h?$c`;T*^gDZW(m@xpwUUh{&>54_ zkN0M)Kh&3Ib2VLThzj77@NBZ6-D!8kv26S#6Uw^n`X~WM;276Dbus zs|1ew@tUP;jjfK{S@KPN{2qgqo!&|ur!xw9e7Nm`x7^sscIdgb@y&@J8Y8JK4u|Y4VIZo`&mA!eeH!t?~4SNe_Z-BjxMmK_X)a+xJ6j!j& z)<9HLfL6f$LtI|hb|NFqrHM;LJ5nT$>RN%LV-cAK`Q9{Vg1>(-D*Orl0fV&2{{r+WSaftLPu7)el074jM%%L6KsCwsLH7fj~v#X!Fn_#Iq&Yqa`_0n{L_?R|%NW zi(9ne(avbkh*fEhbEG)(F`qrwME`i^JTAdL&3I-6p`tl5IXdm2gTkhID&xfp1tIht qz=g5}ll~TaM>wb|c`0%Oyfa<@a9}hTmlX literal 3259 zcmb7G4OA4@6<%0a)?EatKZyv9#wHP=3tLp0Hpe%zBqrGc8lnSesxxr;Vje>(9|hXhLHZ(JFzc$KKhUMNF{J zIb7z=zI*R`_q+Fg@2yYx$^K+RPIfvhF7ltf9! z0g0@HVTvR(yoV7zlAG~iktoxKJcu|T3T`Y(9A9ChJ&eo{C8bPPFN>-Vxg1x;$t>;1 z3{PHB%#jaC_P{v7Ed;9Jo#w@i927ClGkz@598VKvG9UB@s%>sT#5RA>Cv%J_GSx&$ zrbX-)cu5w6ZkaR?B{8B%lq7Adno0w48tRB6&P6tnXUHsPoYPuI*hxyFN3ITF%_P3` zyCnQdbk?ZEy+}$*N}j$APvoBy$;1`b0I)p1CLn4-3OrHLN=2>$w&rHBB-2%l&j*{) zfAeu64WF2bui77AX;%KkloX!U@rpv_v?{4QEiwLp55SW!2Kd-C#iV785g}eSQ;Fy9iukdw zL~H=R4s%|#Pz=X@8_Wt2$f=BN#k8Hu+2x|Vqj&j}KJzjliiS`I} zR9nXrsqXzvXyxwgHp_u((9D)X@pS!rEKYQTj)Hl>@cc=3jI?7{2(u<@5jk$6eu!zph7*zSjM0 z@$;Dr?pizoE*^dPJ`^?k`uoroc7yQ`Uh2a5x&yNpZ$dBZX=-Um7g7GMvxD-B-$Fa% zyIxJ4^IJrGD+-=zsncWcLy_1g(0V7*PY`^y+PhBYXga8piLB`5Q37nxJ1lU(lQ68VZ~I5MIJ-p z8%39wC?*fC5HTiBqmo#``27r7hz`G{J&j7DTfp8tvM>glVw43R%r9~kcV5|ZQ;BmB zXQ~p<2ZyO70rPM^Xtq06<0Mw(_)4G|;0Y^`j*D?|K2FwX!{TP38EJ{Dfh`$Ipl{wE7kgqgV~E)KZ)katfPJ}!+BsB11L@(L=tSGzlx_> z3I+m#D39by<2S9x=iFhnkvlRc0~Yxv$ipBu5HpAcBo!nLo??2!6g!v03PTcXV+1RV*SPQqFncAm@7X&BUH++exZu+My zr(Q}h3`ZF_<9i?b{rz!NVBFl?M_WHvw}$pjXKkOV+iR}19Qj43`gY6m zz4kQ&YI)i6X8Yz|z28T13e!b)zUQ;=Tt&j=1D_nudlNNfbgueU@|7XKe_Gr9t~IkC zy5BFR(P@{W{YE1nNp%0%rtL9%e-Y(c`q243~Kkq6#Ipnu6uVBiw!d`XS?nh>* zxO>d_h8pM0nA&{{)!rXsjOd|YyU>~;3;lgGVln~ti?3t6Uq zJLG@I+uuL9_#(dq+dS~wmk z-i-qz(q&wfvds%$uGzQo=NF_hXt=dT&!Mkhu(SmM*MtX#FpV6^wC002;Z%Yn-U_$mv4^Ne z_H3p9zuZuIZjhcEO3w|Y=SJ0Yqw2Y7*K?!lxoOvPqw2Yd?jnhg#0}Na$}VA-Y8Mu) SnEco;h}EQo6~fV9ZT|(mXll;@ diff --git a/flask_backend/models/BreastCancer_SVM_v1/combo_model_training.py b/flask_backend/models/BreastCancer_SVM_v1/combo_model_training.py index 44eb6323..b17d846b 100644 --- a/flask_backend/models/BreastCancer_SVM_v1/combo_model_training.py +++ b/flask_backend/models/BreastCancer_SVM_v1/combo_model_training.py @@ -1,153 +1,213 @@ - -# Breast Cancer Response Prediction (Post-treatment) combo -# This notebook trains a SVM Classifier to predict patient response based on post-treatment intervention data. +""" +Breast Cancer Response Prediction - Combination Therapy Pre-treatment +""" import warnings import pickle import pandas as pd import numpy as np -from sklearn.model_selection import KFold -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import GroupShuffleSplit +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report from sklearn.svm import SVC -from utils import remove_iqr_outliers -from sklearn.model_selection import cross_val_score, KFold -from sklearn.preprocessing import LabelEncoder, OneHotEncoder -from sklearn.model_selection import GroupShuffleSplit +from utils import remove_iqr_outliers, apply_iqr_bounds from argparse import ArgumentParser warnings.filterwarnings("ignore") -# Load Data parser = ArgumentParser() parser.add_argument("--input", type=str, required=True) args = parser.parse_args() + +print("="*70) +print("COMBINATION THERAPY PRE-TREATMENT MODEL - FINAL") +print("="*70) + +# ======================================== +# 1. LOAD & FILTER DATA +# ======================================== +print("\n[1] Loading data...") df = pd.read_csv(args.input) -# Filter for Post_treatment df_pre_combo = df[(df['Timeline'] == 'Pre_treatment') & (df['Treatment'] == 'anti-PDL1+Chemo')].copy() -print(f"Original shape: {df.shape}, Pre-treatment shape: {df_pre_combo.shape}") +print(f"Filtered to Pre-treatment Combo: {df_pre_combo.shape[0]} cells, {df_pre_combo['Patient_code'].nunique()} patients") -# Drop redundant/irrelevant columns +# Drop unnecessary columns drop_cols = ['Tissue', 'Identifier', 'Timeline', 'defcls', 'Treatment'] +for col in ['group', 'batch', 'myleiden']: + if col in df_pre_combo.columns: + drop_cols.append(col) + df_combo = df_pre_combo.drop(columns=drop_cols) -# Encode categorical variables -le = LabelEncoder() -categorical_cols = [ 'Response'] +# ======================================== +# 2. TRAIN/TEST SPLIT BY PATIENT (FIRST!) +# ======================================== +print("\n[2] Patient-level train/test split...") -label_mappings = {} -for col in categorical_cols: - df_combo[col] = le.fit_transform(df_combo[col]) - label_mappings[col] = dict(zip(le.classes_, le.transform(le.classes_))) +gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42) +groups = df_combo["Patient_code"] -#One Hot Encode Origin -categorical_cols = ['Origin'] -for col in categorical_cols: - encoder = OneHotEncoder(sparse_output=False) - encoded_array = encoder.fit_transform(df_combo[[col]]) - encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out([col])) - df_combo = pd.concat([df_combo.drop(columns=[col]).reset_index(drop=True), encoded_df], axis=1) +train_idx, test_idx = next(gss.split(df_combo, df_combo["Response"], groups=groups)) -#Outlier Removal +train_df = df_combo.iloc[train_idx].copy() +test_df = df_combo.iloc[test_idx].copy() -feature_cols = ["Expression", 'Origin_chest_wall', 'Origin_liver', - 'Origin_lymph_node', "nGene", "percent_mito", "percent_hsp", "percent_ig", "percent_rp", "nUMI", "PDCD1"] -label_cols = ["Response"] -df_combo_ = remove_iqr_outliers(df_combo, feature_cols) +print(f"Train: {len(train_df)} cells, {train_df['Patient_code'].nunique()} patients {sorted(train_df['Patient_code'].unique())}") +print(f"Test: {len(test_df)} cells, {test_df['Patient_code'].nunique()} patients {sorted(test_df['Patient_code'].unique())}") -#Model Training +# ======================================== +# 3. ENCODE RESPONSE (TRAIN ONLY) +# ======================================== +print("\n[3] Encoding Response...") -gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42) +response_encoder = LabelEncoder() +train_df['Response'] = response_encoder.fit_transform(train_df['Response']) +test_df['Response'] = response_encoder.transform(test_df['Response']) -# groups = your patient IDs -groups = df_combo_["Patient_code"] - -train_idx, test_idx = next(gss.split(df_combo_, df_combo_["Response"], groups=groups)) - -train_df = df_combo_.iloc[train_idx] -test_df = df_combo_.iloc[test_idx] - -# ------------------------- -# Define Model -# ------------------------- -def build_model(): - return make_pipeline( - StandardScaler(), - SVC( - kernel="rbf", - C=15, - gamma=0.1, - class_weight="balanced", - probability=True, - random_state=42 - ) - ) - -X = train_df[feature_cols] -y = train_df[label_cols] - -kf = KFold(n_splits=3, shuffle=True, random_state=42) - -best_auc = -1 -best_model = None -fold_num = 1 -best_cm = None -# ------------------------- -# KFold CV Loop -# ------------------------- -for train_idx, val_idx in kf.split(X): - - print(f"\n==============================") - print(f" Fold {fold_num}") - print("==============================") - - model = build_model() - - X_tr, X_test = X.iloc[train_idx], X.iloc[val_idx] - y_tr, y_test = y.iloc[train_idx], y.iloc[val_idx] - - # Train - model.fit(X_tr, y_tr) - - # Predict - y_pred = model.predict(X_test) - y_prob = model.predict_proba(X_test)[:, 1] - - # Metrics - acc = accuracy_score(y_test, y_pred) - auc = roc_auc_score(y_test, y_prob) if len(np.unique(y_test)) > 1 else np.nan - cm = confusion_matrix(y_test, y_pred) - print(f"Accuracy: {acc:.4f}") - print(f"AUC-ROC: {auc:.4f}") - print("\nConfusion Matrix:\n", cm) - print("\nClassification Report:\n", classification_report(y_test, y_pred)) - - # ------------------------- - # SAVE BEST MODEL - # ------------------------- - if auc > best_auc: # <-- CHANGE TO acc > best_acc IF YOU WANT ACCURACY - best_auc = auc - best_model = model - best_cm = cm - print("šŸ”„ New best model found and stored.") - - fold_num += 1 - -print("\n==============================") -print(" BEST MODEL FROM CV") -print("==============================") -print(f"Best AUC: {best_auc:.4f}") - -#Confusion Matrix - -print("\nConfusion Matrix:\n", best_cm) - -# ------------------------- -# SAVE BEST MODEL TO .pkl -# ------------------------- -with open("combo_model.pkl", "wb") as f: - pickle.dump(best_model, f) +label_mappings = dict(zip(response_encoder.classes_, response_encoder.transform(response_encoder.classes_))) +print(f"Encoding: {label_mappings}") + +# ======================================== +# 4. ONE-HOT ENCODE ORIGIN +# ======================================== +print("\n[4] One-hot encoding Origin...") + +all_origins = pd.concat([train_df['Origin'], test_df['Origin']]).unique().reshape(-1, 1) + +origin_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore') +origin_encoder.fit(all_origins) + +train_origin = origin_encoder.transform(train_df[['Origin']]) +test_origin = origin_encoder.transform(test_df[['Origin']]) + +train_origin_df = pd.DataFrame(train_origin, columns=origin_encoder.get_feature_names_out(['Origin']), index=train_df.index) +test_origin_df = pd.DataFrame(test_origin, columns=origin_encoder.get_feature_names_out(['Origin']), index=test_df.index) + +train_df = pd.concat([train_df.drop(columns=['Origin']), train_origin_df], axis=1) +test_df = pd.concat([test_df.drop(columns=['Origin']), test_origin_df], axis=1) + +print(f"Origin columns: {list(origin_encoder.get_feature_names_out(['Origin']))}") -print("\nāœ… combo model saved") \ No newline at end of file +# ======================================== +# 5. OUTLIER REMOVAL (TRAIN ONLY) +# ======================================== +print("\n[5] Outlier removal...") + +feature_cols = ["Expression", 'Origin_chest_wall', 'Origin_liver', 'Origin_lymph_node', + "nGene", "percent_mito", "percent_hsp", "percent_ig", "percent_rp", "nUMI", "PDCD1"] + +train_df, iqr_bounds = remove_iqr_outliers(train_df, feature_cols) +# test_df = apply_iqr_bounds(test_df, iqr_bounds) + +# ======================================== +# 6. PREPARE FEATURES +# ======================================== +X_train = train_df[feature_cols].values +y_train = train_df['Response'].values + +X_test = test_df[feature_cols].values +y_test = test_df['Response'].values + +print(f"\n[6] Final shapes: X_train={X_train.shape}, X_test={X_test.shape}") + +# ======================================== +# 7. CROSS-VALIDATION ASSESSMENT +# ======================================== +print("\n[7] Cross-validation assessment...") + +n_patients = train_df['Patient_code'].nunique() +print(f"Training patients: {n_patients}") +print(f"Response in train: R={np.sum(y_train)}, NR={len(y_train)-np.sum(y_train)}") + +print(f"\nāš ļø DATASET TOO SMALL FOR RELIABLE CROSS-VALIDATION") +print(f"With only {n_patients} training patients, GroupKFold would create") +print(f"validation folds with insufficient samples or single-class data.") +print(f"\nCross-validation SKIPPED (methodologically correct for this dataset size).") +print(f"Model will be evaluated on held-out test set instead.") + +# ======================================== +# 8. TRAIN MODEL ON FULL TRAINING SET +# ======================================== +print(f"\n[8] Training model on full training set...") + +model = Pipeline([ + ('scaler', StandardScaler()), + ('svm', SVC(kernel="rbf", C=15, gamma=0.1, class_weight="balanced", probability=True, random_state=42)) +]) + +model.fit(X_train, y_train.ravel()) +print("āœ… Model trained") + +# ======================================== +# 9. TRAIN SET EVALUATION +# ======================================== +print(f"\n[9] Training set performance...") + +y_train_pred = model.predict(X_train) +y_train_prob = model.predict_proba(X_train)[:, 1] + +train_acc = accuracy_score(y_train, y_train_pred) +train_auc = roc_auc_score(y_train, y_train_prob) if len(np.unique(y_train)) > 1 else np.nan + +print(f"Train Accuracy: {train_acc:.4f}") +print(f"Train AUC: {train_auc:.4f}") + +# ======================================== +# 10. TEST SET EVALUATION +# ======================================== +print(f"\n[10] Test set evaluation...") + +y_test_pred = model.predict(X_test) +y_test_prob = model.predict_proba(X_test)[:, 1] if len(np.unique(y_test)) > 1 else None + +test_acc = accuracy_score(y_test, y_test_pred) +test_auc = roc_auc_score(y_test, y_test_prob) if y_test_prob is not None and len(np.unique(y_test)) > 1 else np.nan +test_cm = confusion_matrix(y_test, y_test_pred) + +print(f"\n{'='*50}") +print("TEST SET RESULTS") +print(f"{'='*50}") +print(f"Accuracy: {test_acc:.4f}") +print(f"AUC: {test_auc:.4f}") +print(f"\nConfusion Matrix:\n{test_cm}") + +if len(np.unique(y_test)) > 1: + print(f"\nClassification Report:\n{classification_report(y_test, y_test_pred, target_names=response_encoder.classes_)}") + +# ======================================== +# 11. SAVE MODEL +# ======================================== +print(f"\n[11] Saving model...") + +model_artifact = { + 'model': model, + 'response_encoder': response_encoder, + 'origin_encoder': origin_encoder, + 'iqr_bounds': iqr_bounds, + 'feature_cols': feature_cols, + 'label_mappings': label_mappings, + 'train_patients': sorted(train_df['Patient_code'].unique()), + 'test_patients': sorted(test_df['Patient_code'].unique()), + 'test_metrics': { + 'accuracy': test_acc, + 'auc': test_auc, + 'confusion_matrix': test_cm.tolist() + }, + 'note': 'CV skipped - dataset too small for reliable GroupKFold' +} + +with open("combo_model.pkl", "wb") as f: + pickle.dump(model_artifact, f) + +print("combo_model.pkl saved") + +print("\n" + "="*70) +print("TRAINING COMPLETE") +print("="*70) +print(f"\nModel Performance:") +print(f" Train: Acc={train_acc:.3f}, AUC={train_auc:.3f}") +print(f" Test: Acc={test_acc:.3f}, AUC={test_auc:.3f}") +print(f"\nLimitation: Only {n_patients} training patients") +print(f"Results are exploratory - larger cohort needed for robust predictions") \ No newline at end of file diff --git a/flask_backend/models/BreastCancer_SVM_v1/utils.py b/flask_backend/models/BreastCancer_SVM_v1/utils.py index 634d9fee..7c242c20 100644 --- a/flask_backend/models/BreastCancer_SVM_v1/utils.py +++ b/flask_backend/models/BreastCancer_SVM_v1/utils.py @@ -1,69 +1,145 @@ -import os -import sys -import numpy as np -import pandas as pd -from typing import List - -def int_conventor(df: pd.DataFrame, col_names: list [str]) -> pd.DataFrame: - ''' - A function to convert the column names from another data type to integer - :param df: data frame - :param col_names: Column names to be converted in to integer - :return: pd.dataFrame - ''' - try: - for col in col_names: - df[col] = df[col].astype(int) - return df - - except Exception as e: - raise e +""" +Utility Functions for Breast Cancer Model Training +""" -def float_conventor(df: pd.DataFrame, col_names: list[str]) -> pd.DataFrame: - ''' - A function to convert the column names from another data type to integer - :param df: data frame - :param col_names: Column names to be converted in to integer - :return: pd.dataFrame - ''' - try: - for col in col_names: - df[col] = df[col].astype(float) - return df +import pandas as pd +import numpy as np - except Exception as e: - raise e -def remove_cols(df: pd.DataFrame, col_names: list[str]) -> pd.DataFrame: - ''' - A function to remove columns from a data type - :param df: - :param col_names: - :return: pd.DataFrame - ''' +def remove_iqr_outliers(df, feature_cols, multiplier=1.5): + """ + Remove outliers using IQR method and return bounds. - try: - df = df.drop(col_names, axis=1, errors='ignore') - return df - except Exception as e: - raise e + Args: + df: DataFrame containing the data + feature_cols: List of feature column names to check for outliers + multiplier: IQR multiplier for outlier detection (default: 1.5) + + Returns: + clean_df: DataFrame with outliers removed + iqr_bounds: Dictionary of {column: (lower_bound, upper_bound)} + """ + iqr_bounds = {} + mask = pd.Series(True, index=df.index) + + for col in feature_cols: + # Skip if column doesn't exist or is not numeric + if col not in df.columns: + continue + + if df[col].dtype == 'object': + continue + + # Calculate IQR + Q1 = df[col].quantile(0.25) + Q3 = df[col].quantile(0.75) + IQR = Q3 - Q1 + + # Calculate bounds + lower = Q1 - multiplier * IQR + upper = Q3 + multiplier * IQR + + # Store bounds for later use on test set + iqr_bounds[col] = (lower, upper) + + # Create mask for this column + col_mask = (df[col] >= lower) & (df[col] <= upper) + mask = mask & col_mask + + # Apply mask + clean_df = df[mask].copy() + + print(f" IQR outlier removal: {len(df)} -> {len(clean_df)} rows ({len(df)-len(clean_df)} outliers removed)") + + return clean_df, iqr_bounds -def remove_iqr_outliers(df, cols): +def apply_iqr_bounds(df, iqr_bounds): """ - Removes outliers using IQR method for the given numeric columns. - Keeps rows where each feature is within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. + Apply pre-computed IQR bounds to filter a dataset. + + This function is used to apply training-set outlier bounds to test data, + ensuring no test data statistics leak into preprocessing. + + Args: + df: DataFrame to filter + iqr_bounds: Dictionary of {column: (lower_bound, upper_bound)} from training set + + Returns: + filtered_df: DataFrame with outliers removed based on training bounds """ - df_clean = df.copy() - - for col in cols: - Q1 = df_clean[col].quantile(0.25) - Q3 = df_clean[col].quantile(0.75) - IQR = Q3 - Q1 + mask = pd.Series(True, index=df.index) + + for col, (lower, upper) in iqr_bounds.items(): + if col in df.columns: + col_mask = (df[col] >= lower) & (df[col] <= upper) + mask = mask & col_mask + + filtered_df = df[mask].copy() + + print(f" Applied IQR bounds: {len(df)} -> {len(filtered_df)} rows ({len(df)-len(filtered_df)} outliers removed)") + + return filtered_df - lower_bound = Q1 - 1.5 * IQR - upper_bound = Q3 + 1.5 * IQR - df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)] - - return df_clean +def validate_preprocessing_pipeline(train_df, test_df, feature_cols): + """ + Validate that preprocessing was done correctly (no data leakage). + + Args: + train_df: Training dataframe + test_df: Test dataframe + feature_cols: List of feature columns + + Returns: + dict: Validation results + """ + validation = { + 'train_test_overlap': False, + 'feature_columns_match': False, + 'no_nulls_train': False, + 'no_nulls_test': False + } + + # Check for patient overlap + train_patients = set(train_df['Patient_code'].unique()) + test_patients = set(test_df['Patient_code'].unique()) + overlap = train_patients & test_patients + + if len(overlap) == 0: + validation['train_test_overlap'] = True + print("āœ… No patient overlap between train and test") + else: + print(f" Patient overlap detected: {overlap}") + + # Check feature columns + train_features = set(train_df.columns) & set(feature_cols) + test_features = set(test_df.columns) & set(feature_cols) + + if train_features == test_features: + validation['feature_columns_match'] = True + print("āœ… Feature columns match between train and test") + else: + missing_in_test = train_features - test_features + missing_in_train = test_features - train_features + if missing_in_test: + print(f" Features missing in test: {missing_in_test}") + if missing_in_train: + print(f" Features missing in train: {missing_in_train}") + + # Check for nulls + if not train_df[feature_cols].isnull().any().any(): + validation['no_nulls_train'] = True + print("āœ… No nulls in training features") + else: + null_cols = train_df[feature_cols].columns[train_df[feature_cols].isnull().any()].tolist() + print(f" Nulls found in training features: {null_cols}") + + if not test_df[feature_cols].isnull().any().any(): + validation['no_nulls_test'] = True + print("āœ… No nulls in test features") + else: + null_cols = test_df[feature_cols].columns[test_df[feature_cols].isnull().any()].tolist() + print(f" Nulls found in test features: {null_cols}") + + return validation \ No newline at end of file