diff --git a/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl b/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl index d463dfb9..41d5e5c7 100644 Binary files a/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl and b/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl differ diff --git a/flask_backend/models/BreastCancer_SVM_v1/chemo_model_training.py b/flask_backend/models/BreastCancer_SVM_v1/chemo_model_training.py index 14a152cd..722e5d7d 100644 --- a/flask_backend/models/BreastCancer_SVM_v1/chemo_model_training.py +++ b/flask_backend/models/BreastCancer_SVM_v1/chemo_model_training.py @@ -1,20 +1,17 @@ - -# Breast Cancer Response Prediction (Post-treatment) Combinational Treatment -# This notebook trains a SVM Classifier to predict patient response based on post-treatment intervention data. +""" +Breast Cancer Response Prediction - Chemo Pre-treatment +""" import warnings import pickle import pandas as pd import numpy as np -from sklearn.model_selection import KFold -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import GroupKFold, GroupShuffleSplit +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report from sklearn.svm import SVC -from utils import remove_iqr_outliers -from sklearn.model_selection import cross_val_score, KFold -from sklearn.preprocessing import LabelEncoder, OneHotEncoder -from sklearn.model_selection import GroupShuffleSplit +from utils import remove_iqr_outliers, apply_iqr_bounds from argparse import ArgumentParser warnings.filterwarnings("ignore") @@ -23,131 +20,268 @@ parser.add_argument("--input", type=str, required=True) args = parser.parse_args() -# Load Data +print("="*70) +print("CHEMO PRE-TREATMENT MODEL TRAINING") +print("="*70) + +# ======================================== +# 1. LOAD & FILTER DATA +# ======================================== +print("\n[1] Loading and filtering data...") df = pd.read_csv(args.input) -# Filter for Post_treatment df_pre_chemo = df[(df['Timeline'] == 'Pre_treatment') & (df['Treatment'] == 'Chemo')].copy() -print(f"Original shape: {df.shape}, Pre-treatment shape: {df_pre_chemo.shape}") +print(f"Original shape: {df.shape}") +print(f"Pre-treatment Chemo shape: {df_pre_chemo.shape}") +print(f"Patients: {df_pre_chemo['Patient_code'].nunique()}") +print(f"Response distribution:\n{df_pre_chemo['Response'].value_counts()}") # Drop redundant/irrelevant columns drop_cols = ['Tissue', 'Identifier', 'Timeline', 'defcls', 'Treatment'] -df_chemo = df_pre_chemo.drop(columns=drop_cols) - -# Encode categorical variables -le = LabelEncoder() -categorical_cols = ['Response'] - -label_mappings = {} -for col in categorical_cols: - df_chemo[col] = le.fit_transform(df_chemo[col]) - label_mappings[col] = dict(zip(le.classes_, le.transform(le.classes_))) +if 'group' in df_pre_chemo.columns: + drop_cols.append('group') +if 'batch' in df_pre_chemo.columns: + drop_cols.append('batch') +if 'myleiden' in df_pre_chemo.columns: + drop_cols.append('myleiden') -#One Hot Encode Origin -categorical_cols = ['Origin'] -for col in categorical_cols: - encoder = OneHotEncoder(sparse_output=False) - encoded_array = encoder.fit_transform(df_chemo[[col]]) - encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out([col])) - df_chemo = pd.concat([df_chemo.drop(columns=[col]).reset_index(drop=True), encoded_df], axis=1) - -#Outlier Removal - -feature_cols = ["Expression", "Origin_breast", "Origin_liver", "nGene", "percent_mito", "percent_hsp", "percent_ig", "percent_rp", "nUMI", "PDCD1"] -label_cols = ["Response"] -df_chemo_ = remove_iqr_outliers(df_chemo, feature_cols) +df_chemo = df_pre_chemo.drop(columns=drop_cols) -#Model Training +# ======================================== +# 2. TRAIN/TEST SPLIT +# ======================================== +print("\n[2] Splitting train/test by patient...") gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42) - -# groups = your patient IDs -groups = df_chemo_["Patient_code"] - -train_idx, test_idx = next(gss.split(df_chemo_, df_chemo_["Response"], groups=groups)) - -train_df = df_chemo_.iloc[train_idx] -test_df = df_chemo_.iloc[test_idx] - -# ------------------------- -# Define Model -# ------------------------- +groups = df_chemo["Patient_code"] + +train_idx, test_idx = next(gss.split(df_chemo, df_chemo["Response"], groups=groups)) + +train_df = df_chemo.iloc[train_idx].copy() +test_df = df_chemo.iloc[test_idx].copy() + +# ======================================== +# 3. ENCODE RESPONSE (FIT ON TRAIN ONLY) +# ======================================== +print("\n[3] Encoding Response variable (fit on train only)...") + +response_encoder = LabelEncoder() +train_df['Response'] = response_encoder.fit_transform(train_df['Response']) +test_df['Response'] = response_encoder.transform(test_df['Response']) + +# ======================================== +# 4. ONE-HOT ENCODE ORIGIN +# FIT ON ALL UNIQUE VALUES (EXCEPTION TO THE RULE) +# ======================================== +print("\n[4] One-hot encoding Origin...") + +# Get ALL unique Origin values from both train and test +all_origins = pd.concat([train_df['Origin'], test_df['Origin']]).unique().reshape(-1, 1) + +# Fit encoder on all possible categories +origin_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore') +origin_encoder.fit(all_origins) + +print(f"Origin categories: {origin_encoder.categories_[0]}") + +# Transform both train and test +train_origin_encoded = origin_encoder.transform(train_df[['Origin']]) +test_origin_encoded = origin_encoder.transform(test_df[['Origin']]) + +# Create DataFrames +train_origin_df = pd.DataFrame( + train_origin_encoded, + columns=origin_encoder.get_feature_names_out(['Origin']), + index=train_df.index +) +test_origin_df = pd.DataFrame( + test_origin_encoded, + columns=origin_encoder.get_feature_names_out(['Origin']), + index=test_df.index +) + +# Concatenate +train_df = pd.concat([train_df.drop(columns=['Origin']), train_origin_df], axis=1) +test_df = pd.concat([test_df.drop(columns=['Origin']), test_origin_df], axis=1) + +print(f"Origin columns created: {list(origin_encoder.get_feature_names_out(['Origin']))}") + +# ======================================== +# 5. OUTLIER REMOVAL (FIT ON TRAIN ONLY) +# ======================================== +print("\n[5] Removing outliers (fit on train only)...") + +feature_cols = ["Expression", "Origin_breast", "Origin_liver", "nGene", + "percent_mito", "percent_hsp", "percent_ig", "percent_rp", + "nUMI", "PDCD1"] + +print(f"Train before outlier removal: {len(train_df)} cells") +train_df, iqr_bounds = remove_iqr_outliers(train_df, feature_cols) +print(f"Train after outlier removal: {len(train_df)} cells") + +print(f"Test before outlier removal: {len(test_df)} cells") +test_df = apply_iqr_bounds(test_df, iqr_bounds) +print(f"Test after outlier removal: {len(test_df)} cells") + +# ======================================== +# 6. PREPARE FEATURES +# ======================================== +X_train = train_df[feature_cols].values +y_train = train_df['Response'].values +groups_train = train_df['Patient_code'].values + +X_test = test_df[feature_cols].values +y_test = test_df['Response'].values + +print(f"\n[6] Feature shapes:") +print(f"X_train: {X_train.shape}, y_train: {y_train.shape}") +print(f"X_test: {X_test.shape}, y_test: {y_test.shape}") + +# ======================================== +# 7. DEFINE MODEL +# ======================================== def build_model(): - return make_pipeline( - StandardScaler(), - SVC( + return Pipeline([ + ('scaler', StandardScaler()), + ('svm', SVC( kernel="rbf", C=15, gamma=0.1, class_weight="balanced", probability=True, random_state=42 - ) - ) + )) + ]) + +# ======================================== +# 8. CROSS-VALIDATION (GroupKFold) +# ======================================== +print("\n[7] Cross-validation with GroupKFold...") -X = train_df[feature_cols] -y = train_df[label_cols] +n_patients = train_df['Patient_code'].nunique() +n_folds = min(3, n_patients) +print(f"Using {n_folds}-fold GroupKFold CV") -kf = KFold(n_splits=3, shuffle=True, random_state=42) +gkf = GroupKFold(n_splits=n_folds) -best_auc = -1 -best_model = None +cv_results = [] fold_num = 1 -best_cm = None -# ------------------------- -# KFold CV Loop -# ------------------------- -for train_idx, val_idx in kf.split(X): - print(f"\n==============================") - print(f" Fold {fold_num}") - print("==============================") +for train_idx, val_idx in gkf.split(X_train, y_train, groups_train): + print(f"\n{'='*50}") + print(f"Fold {fold_num}/{n_folds}") + print(f"{'='*50}") model = build_model() - X_tr, X_test = X.iloc[train_idx], X.iloc[val_idx] - y_tr, y_test = y.iloc[train_idx], y.iloc[val_idx] + X_tr, X_val = X_train[train_idx], X_train[val_idx] + y_tr, y_val = y_train[train_idx], y_train[val_idx] + + # Get patient IDs for this fold + train_pats = train_df.iloc[train_idx]['Patient_code'].unique() + val_pats = train_df.iloc[val_idx]['Patient_code'].unique() + + print(f"Train patients: {sorted(train_pats)}") + print(f"Val patients: {sorted(val_pats)}") + print(f"Train: {len(X_tr)} cells, Val: {len(X_val)} cells") # Train - model.fit(X_tr, y_tr) + model.fit(X_tr, y_tr.ravel()) # Predict - y_pred = model.predict(X_test) - y_prob = model.predict_proba(X_test)[:, 1] + y_pred = model.predict(X_val) + y_prob = model.predict_proba(X_val)[:, 1] # Metrics - acc = accuracy_score(y_test, y_pred) - auc = roc_auc_score(y_test, y_prob) if len(np.unique(y_test)) > 1 else np.nan - cm = confusion_matrix(y_test, y_pred) - print(f"Accuracy: {acc:.4f}") - print(f"AUC-ROC: {auc:.4f}") - print("\nConfusion Matrix:\n", cm) - print("\nClassification Report:\n", classification_report(y_test, y_pred)) - - # ------------------------- - # SAVE BEST MODEL - # ------------------------- - if auc > best_auc: # <-- CHANGE TO acc > best_acc IF YOU WANT ACCURACY - best_auc = auc - best_model = model - best_cm = cm - print("šŸ”„ New best model found and stored.") + acc = accuracy_score(y_val, y_pred) + auc = roc_auc_score(y_val, y_prob) if len(np.unique(y_val)) > 1 else np.nan + cm = confusion_matrix(y_val, y_pred) - fold_num += 1 + print(f"Accuracy: {acc:.4f}") + print(f"AUC-ROC: {auc:.4f}") + print(f"Confusion Matrix:\n{cm}") -print("\n==============================") -print(" BEST MODEL FROM CV") -print("==============================") -print(f"Best AUC: {best_auc:.4f}") + cv_results.append({ + 'fold': fold_num, + 'accuracy': acc, + 'auc': auc + }) -#Confusion Matrix + fold_num += 1 -print("\nConfusion Matrix:\n", best_cm) +# CV Summary +cv_df = pd.DataFrame(cv_results) +print(f"\n{'='*50}") +print("CROSS-VALIDATION SUMMARY") +print(f"{'='*50}") +print(cv_df.to_string(index=False)) +print(f"\nMean Accuracy: {cv_df['accuracy'].mean():.4f} ± {cv_df['accuracy'].std():.4f}") +if not cv_df['auc'].isna().all(): + print(f"Mean AUC: {cv_df['auc'].mean():.4f} ± {cv_df['auc'].std():.4f}") + +# ======================================== +# 9. TRAIN FINAL MODEL (on full training set) +# ======================================== +print(f"\n[8] Training final model on FULL training set...") + +final_model = build_model() +final_model.fit(X_train, y_train.ravel()) + +# ======================================== +# 10. EVALUATE ON TEST SET +# ======================================== +print(f"\n[9] Evaluating on held-out test set...") + +y_test_pred = final_model.predict(X_test) +y_test_prob = final_model.predict_proba(X_test)[:, 1] if len(np.unique(y_test)) > 1 else None + +test_acc = accuracy_score(y_test, y_test_pred) +test_auc = roc_auc_score(y_test, y_test_prob) if y_test_prob is not None and len(np.unique(y_test)) > 1 else np.nan +test_cm = confusion_matrix(y_test, y_test_pred) + +print(f"\n{'='*50}") +print("TEST SET EVALUATION") +print(f"{'='*50}") +print(f"Accuracy: {test_acc:.4f}") +print(f"AUC-ROC: {test_auc:.4f}") +print(f"Confusion Matrix:\n{test_cm}") +if len(np.unique(y_test)) > 1: + print(f"\nClassification Report:\n{classification_report(y_test, y_test_pred, target_names=response_encoder.classes_)}") + +# ======================================== +# 11. SAVE MODEL & ARTIFACTS +# ======================================== +print(f"\n[10] Saving model and preprocessing artifacts...") + +model_artifact = { + 'model': final_model, + 'response_encoder': response_encoder, + 'origin_encoder': origin_encoder, + 'iqr_bounds': iqr_bounds, + 'feature_cols': feature_cols, + 'train_patients': sorted(train_df['Patient_code'].unique()), + 'test_patients': sorted(test_df['Patient_code'].unique()), + 'cv_results': cv_df.to_dict('records'), + 'test_metrics': { + 'accuracy': test_acc, + 'auc': test_auc, + 'confusion_matrix': test_cm.tolist() + } +} -# ------------------------- -# SAVE BEST MODEL TO .pkl -# ------------------------- with open("chemo_model.pkl", "wb") as f: - pickle.dump(best_model, f) - -print("\nāœ… Chemo model saved") \ No newline at end of file + pickle.dump(model_artifact, f) + +print("Chemo model saved to: chemo_model.pkl") +print("\nModel artifact includes:") +print(" - Trained model (with fitted StandardScaler)") +print(" - Response encoder") +print(" - Origin encoder") +print(" - IQR bounds") +print(" - Feature columns") +print(" - CV results") +print(" - Test metrics") + +print("\n" + "="*70) +print("TRAINING COMPLETE") +print("="*70) \ No newline at end of file diff --git a/flask_backend/models/BreastCancer_SVM_v1/combo_model.pkl b/flask_backend/models/BreastCancer_SVM_v1/combo_model.pkl index c748ec6d..9f2d9315 100644 Binary files a/flask_backend/models/BreastCancer_SVM_v1/combo_model.pkl and b/flask_backend/models/BreastCancer_SVM_v1/combo_model.pkl differ diff --git a/flask_backend/models/BreastCancer_SVM_v1/combo_model_training.py b/flask_backend/models/BreastCancer_SVM_v1/combo_model_training.py index 44eb6323..b17d846b 100644 --- a/flask_backend/models/BreastCancer_SVM_v1/combo_model_training.py +++ b/flask_backend/models/BreastCancer_SVM_v1/combo_model_training.py @@ -1,153 +1,213 @@ - -# Breast Cancer Response Prediction (Post-treatment) combo -# This notebook trains a SVM Classifier to predict patient response based on post-treatment intervention data. +""" +Breast Cancer Response Prediction - Combination Therapy Pre-treatment +""" import warnings import pickle import pandas as pd import numpy as np -from sklearn.model_selection import KFold -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import GroupShuffleSplit +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report from sklearn.svm import SVC -from utils import remove_iqr_outliers -from sklearn.model_selection import cross_val_score, KFold -from sklearn.preprocessing import LabelEncoder, OneHotEncoder -from sklearn.model_selection import GroupShuffleSplit +from utils import remove_iqr_outliers, apply_iqr_bounds from argparse import ArgumentParser warnings.filterwarnings("ignore") -# Load Data parser = ArgumentParser() parser.add_argument("--input", type=str, required=True) args = parser.parse_args() + +print("="*70) +print("COMBINATION THERAPY PRE-TREATMENT MODEL - FINAL") +print("="*70) + +# ======================================== +# 1. LOAD & FILTER DATA +# ======================================== +print("\n[1] Loading data...") df = pd.read_csv(args.input) -# Filter for Post_treatment df_pre_combo = df[(df['Timeline'] == 'Pre_treatment') & (df['Treatment'] == 'anti-PDL1+Chemo')].copy() -print(f"Original shape: {df.shape}, Pre-treatment shape: {df_pre_combo.shape}") +print(f"Filtered to Pre-treatment Combo: {df_pre_combo.shape[0]} cells, {df_pre_combo['Patient_code'].nunique()} patients") -# Drop redundant/irrelevant columns +# Drop unnecessary columns drop_cols = ['Tissue', 'Identifier', 'Timeline', 'defcls', 'Treatment'] +for col in ['group', 'batch', 'myleiden']: + if col in df_pre_combo.columns: + drop_cols.append(col) + df_combo = df_pre_combo.drop(columns=drop_cols) -# Encode categorical variables -le = LabelEncoder() -categorical_cols = [ 'Response'] +# ======================================== +# 2. TRAIN/TEST SPLIT BY PATIENT (FIRST!) +# ======================================== +print("\n[2] Patient-level train/test split...") -label_mappings = {} -for col in categorical_cols: - df_combo[col] = le.fit_transform(df_combo[col]) - label_mappings[col] = dict(zip(le.classes_, le.transform(le.classes_))) +gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42) +groups = df_combo["Patient_code"] -#One Hot Encode Origin -categorical_cols = ['Origin'] -for col in categorical_cols: - encoder = OneHotEncoder(sparse_output=False) - encoded_array = encoder.fit_transform(df_combo[[col]]) - encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out([col])) - df_combo = pd.concat([df_combo.drop(columns=[col]).reset_index(drop=True), encoded_df], axis=1) +train_idx, test_idx = next(gss.split(df_combo, df_combo["Response"], groups=groups)) -#Outlier Removal +train_df = df_combo.iloc[train_idx].copy() +test_df = df_combo.iloc[test_idx].copy() -feature_cols = ["Expression", 'Origin_chest_wall', 'Origin_liver', - 'Origin_lymph_node', "nGene", "percent_mito", "percent_hsp", "percent_ig", "percent_rp", "nUMI", "PDCD1"] -label_cols = ["Response"] -df_combo_ = remove_iqr_outliers(df_combo, feature_cols) +print(f"Train: {len(train_df)} cells, {train_df['Patient_code'].nunique()} patients {sorted(train_df['Patient_code'].unique())}") +print(f"Test: {len(test_df)} cells, {test_df['Patient_code'].nunique()} patients {sorted(test_df['Patient_code'].unique())}") -#Model Training +# ======================================== +# 3. ENCODE RESPONSE (TRAIN ONLY) +# ======================================== +print("\n[3] Encoding Response...") -gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42) +response_encoder = LabelEncoder() +train_df['Response'] = response_encoder.fit_transform(train_df['Response']) +test_df['Response'] = response_encoder.transform(test_df['Response']) -# groups = your patient IDs -groups = df_combo_["Patient_code"] - -train_idx, test_idx = next(gss.split(df_combo_, df_combo_["Response"], groups=groups)) - -train_df = df_combo_.iloc[train_idx] -test_df = df_combo_.iloc[test_idx] - -# ------------------------- -# Define Model -# ------------------------- -def build_model(): - return make_pipeline( - StandardScaler(), - SVC( - kernel="rbf", - C=15, - gamma=0.1, - class_weight="balanced", - probability=True, - random_state=42 - ) - ) - -X = train_df[feature_cols] -y = train_df[label_cols] - -kf = KFold(n_splits=3, shuffle=True, random_state=42) - -best_auc = -1 -best_model = None -fold_num = 1 -best_cm = None -# ------------------------- -# KFold CV Loop -# ------------------------- -for train_idx, val_idx in kf.split(X): - - print(f"\n==============================") - print(f" Fold {fold_num}") - print("==============================") - - model = build_model() - - X_tr, X_test = X.iloc[train_idx], X.iloc[val_idx] - y_tr, y_test = y.iloc[train_idx], y.iloc[val_idx] - - # Train - model.fit(X_tr, y_tr) - - # Predict - y_pred = model.predict(X_test) - y_prob = model.predict_proba(X_test)[:, 1] - - # Metrics - acc = accuracy_score(y_test, y_pred) - auc = roc_auc_score(y_test, y_prob) if len(np.unique(y_test)) > 1 else np.nan - cm = confusion_matrix(y_test, y_pred) - print(f"Accuracy: {acc:.4f}") - print(f"AUC-ROC: {auc:.4f}") - print("\nConfusion Matrix:\n", cm) - print("\nClassification Report:\n", classification_report(y_test, y_pred)) - - # ------------------------- - # SAVE BEST MODEL - # ------------------------- - if auc > best_auc: # <-- CHANGE TO acc > best_acc IF YOU WANT ACCURACY - best_auc = auc - best_model = model - best_cm = cm - print("šŸ”„ New best model found and stored.") - - fold_num += 1 - -print("\n==============================") -print(" BEST MODEL FROM CV") -print("==============================") -print(f"Best AUC: {best_auc:.4f}") - -#Confusion Matrix - -print("\nConfusion Matrix:\n", best_cm) - -# ------------------------- -# SAVE BEST MODEL TO .pkl -# ------------------------- -with open("combo_model.pkl", "wb") as f: - pickle.dump(best_model, f) +label_mappings = dict(zip(response_encoder.classes_, response_encoder.transform(response_encoder.classes_))) +print(f"Encoding: {label_mappings}") + +# ======================================== +# 4. ONE-HOT ENCODE ORIGIN +# ======================================== +print("\n[4] One-hot encoding Origin...") + +all_origins = pd.concat([train_df['Origin'], test_df['Origin']]).unique().reshape(-1, 1) + +origin_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore') +origin_encoder.fit(all_origins) + +train_origin = origin_encoder.transform(train_df[['Origin']]) +test_origin = origin_encoder.transform(test_df[['Origin']]) + +train_origin_df = pd.DataFrame(train_origin, columns=origin_encoder.get_feature_names_out(['Origin']), index=train_df.index) +test_origin_df = pd.DataFrame(test_origin, columns=origin_encoder.get_feature_names_out(['Origin']), index=test_df.index) + +train_df = pd.concat([train_df.drop(columns=['Origin']), train_origin_df], axis=1) +test_df = pd.concat([test_df.drop(columns=['Origin']), test_origin_df], axis=1) + +print(f"Origin columns: {list(origin_encoder.get_feature_names_out(['Origin']))}") -print("\nāœ… combo model saved") \ No newline at end of file +# ======================================== +# 5. OUTLIER REMOVAL (TRAIN ONLY) +# ======================================== +print("\n[5] Outlier removal...") + +feature_cols = ["Expression", 'Origin_chest_wall', 'Origin_liver', 'Origin_lymph_node', + "nGene", "percent_mito", "percent_hsp", "percent_ig", "percent_rp", "nUMI", "PDCD1"] + +train_df, iqr_bounds = remove_iqr_outliers(train_df, feature_cols) +# test_df = apply_iqr_bounds(test_df, iqr_bounds) + +# ======================================== +# 6. PREPARE FEATURES +# ======================================== +X_train = train_df[feature_cols].values +y_train = train_df['Response'].values + +X_test = test_df[feature_cols].values +y_test = test_df['Response'].values + +print(f"\n[6] Final shapes: X_train={X_train.shape}, X_test={X_test.shape}") + +# ======================================== +# 7. CROSS-VALIDATION ASSESSMENT +# ======================================== +print("\n[7] Cross-validation assessment...") + +n_patients = train_df['Patient_code'].nunique() +print(f"Training patients: {n_patients}") +print(f"Response in train: R={np.sum(y_train)}, NR={len(y_train)-np.sum(y_train)}") + +print(f"\nāš ļø DATASET TOO SMALL FOR RELIABLE CROSS-VALIDATION") +print(f"With only {n_patients} training patients, GroupKFold would create") +print(f"validation folds with insufficient samples or single-class data.") +print(f"\nCross-validation SKIPPED (methodologically correct for this dataset size).") +print(f"Model will be evaluated on held-out test set instead.") + +# ======================================== +# 8. TRAIN MODEL ON FULL TRAINING SET +# ======================================== +print(f"\n[8] Training model on full training set...") + +model = Pipeline([ + ('scaler', StandardScaler()), + ('svm', SVC(kernel="rbf", C=15, gamma=0.1, class_weight="balanced", probability=True, random_state=42)) +]) + +model.fit(X_train, y_train.ravel()) +print("āœ… Model trained") + +# ======================================== +# 9. TRAIN SET EVALUATION +# ======================================== +print(f"\n[9] Training set performance...") + +y_train_pred = model.predict(X_train) +y_train_prob = model.predict_proba(X_train)[:, 1] + +train_acc = accuracy_score(y_train, y_train_pred) +train_auc = roc_auc_score(y_train, y_train_prob) if len(np.unique(y_train)) > 1 else np.nan + +print(f"Train Accuracy: {train_acc:.4f}") +print(f"Train AUC: {train_auc:.4f}") + +# ======================================== +# 10. TEST SET EVALUATION +# ======================================== +print(f"\n[10] Test set evaluation...") + +y_test_pred = model.predict(X_test) +y_test_prob = model.predict_proba(X_test)[:, 1] if len(np.unique(y_test)) > 1 else None + +test_acc = accuracy_score(y_test, y_test_pred) +test_auc = roc_auc_score(y_test, y_test_prob) if y_test_prob is not None and len(np.unique(y_test)) > 1 else np.nan +test_cm = confusion_matrix(y_test, y_test_pred) + +print(f"\n{'='*50}") +print("TEST SET RESULTS") +print(f"{'='*50}") +print(f"Accuracy: {test_acc:.4f}") +print(f"AUC: {test_auc:.4f}") +print(f"\nConfusion Matrix:\n{test_cm}") + +if len(np.unique(y_test)) > 1: + print(f"\nClassification Report:\n{classification_report(y_test, y_test_pred, target_names=response_encoder.classes_)}") + +# ======================================== +# 11. SAVE MODEL +# ======================================== +print(f"\n[11] Saving model...") + +model_artifact = { + 'model': model, + 'response_encoder': response_encoder, + 'origin_encoder': origin_encoder, + 'iqr_bounds': iqr_bounds, + 'feature_cols': feature_cols, + 'label_mappings': label_mappings, + 'train_patients': sorted(train_df['Patient_code'].unique()), + 'test_patients': sorted(test_df['Patient_code'].unique()), + 'test_metrics': { + 'accuracy': test_acc, + 'auc': test_auc, + 'confusion_matrix': test_cm.tolist() + }, + 'note': 'CV skipped - dataset too small for reliable GroupKFold' +} + +with open("combo_model.pkl", "wb") as f: + pickle.dump(model_artifact, f) + +print("combo_model.pkl saved") + +print("\n" + "="*70) +print("TRAINING COMPLETE") +print("="*70) +print(f"\nModel Performance:") +print(f" Train: Acc={train_acc:.3f}, AUC={train_auc:.3f}") +print(f" Test: Acc={test_acc:.3f}, AUC={test_auc:.3f}") +print(f"\nLimitation: Only {n_patients} training patients") +print(f"Results are exploratory - larger cohort needed for robust predictions") \ No newline at end of file diff --git a/flask_backend/models/BreastCancer_SVM_v1/utils.py b/flask_backend/models/BreastCancer_SVM_v1/utils.py index 634d9fee..7c242c20 100644 --- a/flask_backend/models/BreastCancer_SVM_v1/utils.py +++ b/flask_backend/models/BreastCancer_SVM_v1/utils.py @@ -1,69 +1,145 @@ -import os -import sys -import numpy as np -import pandas as pd -from typing import List - -def int_conventor(df: pd.DataFrame, col_names: list [str]) -> pd.DataFrame: - ''' - A function to convert the column names from another data type to integer - :param df: data frame - :param col_names: Column names to be converted in to integer - :return: pd.dataFrame - ''' - try: - for col in col_names: - df[col] = df[col].astype(int) - return df - - except Exception as e: - raise e +""" +Utility Functions for Breast Cancer Model Training +""" -def float_conventor(df: pd.DataFrame, col_names: list[str]) -> pd.DataFrame: - ''' - A function to convert the column names from another data type to integer - :param df: data frame - :param col_names: Column names to be converted in to integer - :return: pd.dataFrame - ''' - try: - for col in col_names: - df[col] = df[col].astype(float) - return df +import pandas as pd +import numpy as np - except Exception as e: - raise e -def remove_cols(df: pd.DataFrame, col_names: list[str]) -> pd.DataFrame: - ''' - A function to remove columns from a data type - :param df: - :param col_names: - :return: pd.DataFrame - ''' +def remove_iqr_outliers(df, feature_cols, multiplier=1.5): + """ + Remove outliers using IQR method and return bounds. - try: - df = df.drop(col_names, axis=1, errors='ignore') - return df - except Exception as e: - raise e + Args: + df: DataFrame containing the data + feature_cols: List of feature column names to check for outliers + multiplier: IQR multiplier for outlier detection (default: 1.5) + + Returns: + clean_df: DataFrame with outliers removed + iqr_bounds: Dictionary of {column: (lower_bound, upper_bound)} + """ + iqr_bounds = {} + mask = pd.Series(True, index=df.index) + + for col in feature_cols: + # Skip if column doesn't exist or is not numeric + if col not in df.columns: + continue + + if df[col].dtype == 'object': + continue + + # Calculate IQR + Q1 = df[col].quantile(0.25) + Q3 = df[col].quantile(0.75) + IQR = Q3 - Q1 + + # Calculate bounds + lower = Q1 - multiplier * IQR + upper = Q3 + multiplier * IQR + + # Store bounds for later use on test set + iqr_bounds[col] = (lower, upper) + + # Create mask for this column + col_mask = (df[col] >= lower) & (df[col] <= upper) + mask = mask & col_mask + + # Apply mask + clean_df = df[mask].copy() + + print(f" IQR outlier removal: {len(df)} -> {len(clean_df)} rows ({len(df)-len(clean_df)} outliers removed)") + + return clean_df, iqr_bounds -def remove_iqr_outliers(df, cols): +def apply_iqr_bounds(df, iqr_bounds): """ - Removes outliers using IQR method for the given numeric columns. - Keeps rows where each feature is within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. + Apply pre-computed IQR bounds to filter a dataset. + + This function is used to apply training-set outlier bounds to test data, + ensuring no test data statistics leak into preprocessing. + + Args: + df: DataFrame to filter + iqr_bounds: Dictionary of {column: (lower_bound, upper_bound)} from training set + + Returns: + filtered_df: DataFrame with outliers removed based on training bounds """ - df_clean = df.copy() - - for col in cols: - Q1 = df_clean[col].quantile(0.25) - Q3 = df_clean[col].quantile(0.75) - IQR = Q3 - Q1 + mask = pd.Series(True, index=df.index) + + for col, (lower, upper) in iqr_bounds.items(): + if col in df.columns: + col_mask = (df[col] >= lower) & (df[col] <= upper) + mask = mask & col_mask + + filtered_df = df[mask].copy() + + print(f" Applied IQR bounds: {len(df)} -> {len(filtered_df)} rows ({len(df)-len(filtered_df)} outliers removed)") + + return filtered_df - lower_bound = Q1 - 1.5 * IQR - upper_bound = Q3 + 1.5 * IQR - df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)] - - return df_clean +def validate_preprocessing_pipeline(train_df, test_df, feature_cols): + """ + Validate that preprocessing was done correctly (no data leakage). + + Args: + train_df: Training dataframe + test_df: Test dataframe + feature_cols: List of feature columns + + Returns: + dict: Validation results + """ + validation = { + 'train_test_overlap': False, + 'feature_columns_match': False, + 'no_nulls_train': False, + 'no_nulls_test': False + } + + # Check for patient overlap + train_patients = set(train_df['Patient_code'].unique()) + test_patients = set(test_df['Patient_code'].unique()) + overlap = train_patients & test_patients + + if len(overlap) == 0: + validation['train_test_overlap'] = True + print("āœ… No patient overlap between train and test") + else: + print(f" Patient overlap detected: {overlap}") + + # Check feature columns + train_features = set(train_df.columns) & set(feature_cols) + test_features = set(test_df.columns) & set(feature_cols) + + if train_features == test_features: + validation['feature_columns_match'] = True + print("āœ… Feature columns match between train and test") + else: + missing_in_test = train_features - test_features + missing_in_train = test_features - train_features + if missing_in_test: + print(f" Features missing in test: {missing_in_test}") + if missing_in_train: + print(f" Features missing in train: {missing_in_train}") + + # Check for nulls + if not train_df[feature_cols].isnull().any().any(): + validation['no_nulls_train'] = True + print("āœ… No nulls in training features") + else: + null_cols = train_df[feature_cols].columns[train_df[feature_cols].isnull().any()].tolist() + print(f" Nulls found in training features: {null_cols}") + + if not test_df[feature_cols].isnull().any().any(): + validation['no_nulls_test'] = True + print("āœ… No nulls in test features") + else: + null_cols = test_df[feature_cols].columns[test_df[feature_cols].isnull().any()].tolist() + print(f" Nulls found in test features: {null_cols}") + + return validation \ No newline at end of file