diff --git a/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl b/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl
index d463dfb9..41d5e5c7 100644
Binary files a/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl and b/flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl differ
diff --git a/flask_backend/models/BreastCancer_SVM_v1/chemo_model_training.py b/flask_backend/models/BreastCancer_SVM_v1/chemo_model_training.py
index 14a152cd..722e5d7d 100644
--- a/flask_backend/models/BreastCancer_SVM_v1/chemo_model_training.py
+++ b/flask_backend/models/BreastCancer_SVM_v1/chemo_model_training.py
@@ -1,20 +1,17 @@
-
-# Breast Cancer Response Prediction (Post-treatment) Combinational Treatment  
-# This notebook trains a SVM Classifier to predict patient response based on post-treatment intervention data.
+"""
+Breast Cancer Response Prediction - Chemo Pre-treatment
+"""
 
 import warnings
 import pickle
 import pandas as pd
 import numpy as np
-from sklearn.model_selection import KFold
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import GroupKFold, GroupShuffleSplit
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
 from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
 from sklearn.svm import SVC
-from utils import remove_iqr_outliers
-from sklearn.model_selection import cross_val_score, KFold
-from sklearn.preprocessing import LabelEncoder, OneHotEncoder
-from sklearn.model_selection import GroupShuffleSplit
+from utils import remove_iqr_outliers, apply_iqr_bounds
 from argparse import ArgumentParser
 
 warnings.filterwarnings("ignore")
@@ -23,131 +20,268 @@
 parser.add_argument("--input", type=str, required=True)
 args = parser.parse_args()
 
-# Load Data
+print("="*70)
+print("CHEMO PRE-TREATMENT MODEL TRAINING")
+print("="*70)
+
+# ========================================
+# 1. LOAD & FILTER DATA
+# ========================================
+print("\n[1] Loading and filtering data...")
 df = pd.read_csv(args.input)
 
-# Filter for Post_treatment
 df_pre_chemo = df[(df['Timeline'] == 'Pre_treatment') & (df['Treatment'] == 'Chemo')].copy()
-print(f"Original shape: {df.shape}, Pre-treatment shape: {df_pre_chemo.shape}")
+print(f"Original shape: {df.shape}")
+print(f"Pre-treatment Chemo shape: {df_pre_chemo.shape}")
+print(f"Patients: {df_pre_chemo['Patient_code'].nunique()}")
+print(f"Response distribution:\n{df_pre_chemo['Response'].value_counts()}")
 
 # Drop redundant/irrelevant columns
 drop_cols = ['Tissue', 'Identifier', 'Timeline', 'defcls', 'Treatment']
-df_chemo = df_pre_chemo.drop(columns=drop_cols)
-
-# Encode categorical variables
-le = LabelEncoder()
-categorical_cols = ['Response']
-
-label_mappings = {}
-for col in categorical_cols:
-    df_chemo[col] = le.fit_transform(df_chemo[col])
-    label_mappings[col] = dict(zip(le.classes_, le.transform(le.classes_)))
+if 'group' in df_pre_chemo.columns:
+    drop_cols.append('group')
+if 'batch' in df_pre_chemo.columns:
+    drop_cols.append('batch')
+if 'myleiden' in df_pre_chemo.columns:
+    drop_cols.append('myleiden')
 
-#One Hot Encode Origin
-categorical_cols = ['Origin']
-for col in categorical_cols:
-    encoder = OneHotEncoder(sparse_output=False)
-    encoded_array = encoder.fit_transform(df_chemo[[col]])
-    encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out([col]))
-    df_chemo = pd.concat([df_chemo.drop(columns=[col]).reset_index(drop=True), encoded_df], axis=1)
-
-#Outlier Removal
-
-feature_cols = ["Expression", "Origin_breast", "Origin_liver", "nGene",	"percent_mito",	"percent_hsp",	"percent_ig",	"percent_rp",	"nUMI",	"PDCD1"]
-label_cols = ["Response"]
-df_chemo_ = remove_iqr_outliers(df_chemo, feature_cols)
+df_chemo = df_pre_chemo.drop(columns=drop_cols)
 
-#Model Training
+# ========================================
+# 2. TRAIN/TEST SPLIT
+# ========================================
+print("\n[2] Splitting train/test by patient...")
 
 gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
-
-# groups = your patient IDs
-groups = df_chemo_["Patient_code"]
-
-train_idx, test_idx = next(gss.split(df_chemo_, df_chemo_["Response"], groups=groups))
-
-train_df = df_chemo_.iloc[train_idx]
-test_df  = df_chemo_.iloc[test_idx]
-
-# -------------------------
-# Define Model
-# -------------------------
+groups = df_chemo["Patient_code"]
+
+train_idx, test_idx = next(gss.split(df_chemo, df_chemo["Response"], groups=groups))
+
+train_df = df_chemo.iloc[train_idx].copy()
+test_df = df_chemo.iloc[test_idx].copy()
+
+# ========================================
+# 3. ENCODE RESPONSE (FIT ON TRAIN ONLY)
+# ========================================
+print("\n[3] Encoding Response variable (fit on train only)...")
+
+response_encoder = LabelEncoder()
+train_df['Response'] = response_encoder.fit_transform(train_df['Response'])
+test_df['Response'] = response_encoder.transform(test_df['Response'])
+
+# ========================================
+# 4. ONE-HOT ENCODE ORIGIN
+# FIT ON CATEGORY NAMES FROM TRAIN + TEST (EXCEPTION TO THE FIT-ON-TRAIN-ONLY RULE)
+# ========================================
+print("\n[4] One-hot encoding Origin...")
+
+# Get ALL unique Origin values from both train and test
+all_origins = pd.concat([train_df['Origin'], test_df['Origin']]).unique().reshape(-1, 1)
+
+# Fit encoder on all possible categories
+origin_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
+origin_encoder.fit(all_origins)
+
+print(f"Origin categories: {origin_encoder.categories_[0]}")
+
+# Transform both train and test
+train_origin_encoded = origin_encoder.transform(train_df[['Origin']])
+test_origin_encoded = origin_encoder.transform(test_df[['Origin']])
+
+# Create DataFrames
+train_origin_df = pd.DataFrame(
+    train_origin_encoded,
+    columns=origin_encoder.get_feature_names_out(['Origin']),
+    index=train_df.index
+)
+test_origin_df = pd.DataFrame(
+    test_origin_encoded,
+    columns=origin_encoder.get_feature_names_out(['Origin']),
+    index=test_df.index
+)
+
+# Concatenate
+train_df = pd.concat([train_df.drop(columns=['Origin']), train_origin_df], axis=1)
+test_df = pd.concat([test_df.drop(columns=['Origin']), test_origin_df], axis=1)
+
+print(f"Origin columns created: {list(origin_encoder.get_feature_names_out(['Origin']))}")
+
+# ========================================
+# 5. OUTLIER REMOVAL (FIT ON TRAIN ONLY)
+# ========================================
+print("\n[5] Removing outliers (fit on train only)...")
+
+feature_cols = ["Expression", "Origin_breast", "Origin_liver", "nGene", 
+                "percent_mito", "percent_hsp", "percent_ig", "percent_rp", 
+                "nUMI", "PDCD1"]
+
+print(f"Train before outlier removal: {len(train_df)} cells")
+train_df, iqr_bounds = remove_iqr_outliers(train_df, feature_cols)
+print(f"Train after outlier removal: {len(train_df)} cells")
+
+print(f"Test before outlier removal: {len(test_df)} cells")
+test_df = apply_iqr_bounds(test_df, iqr_bounds)
+print(f"Test after outlier removal: {len(test_df)} cells")
+
+# ========================================
+# 6. PREPARE FEATURES
+# ========================================
+X_train = train_df[feature_cols].values
+y_train = train_df['Response'].values
+groups_train = train_df['Patient_code'].values
+
+X_test = test_df[feature_cols].values
+y_test = test_df['Response'].values
+
+print(f"\n[6] Feature shapes:")
+print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
+print(f"X_test:  {X_test.shape}, y_test:  {y_test.shape}")
+
+# ========================================
+# 7. DEFINE MODEL
+# ========================================
 def build_model():
-    return make_pipeline(
-        StandardScaler(),
-        SVC(
+    return Pipeline([
+        ('scaler', StandardScaler()),
+        ('svm', SVC(
             kernel="rbf",
             C=15,
             gamma=0.1,
             class_weight="balanced",
             probability=True,
             random_state=42
-        )
-    )
+        ))
+    ])
+
+# ========================================
+# 8. CROSS-VALIDATION (GroupKFold)
+# ========================================
+print("\n[7] Cross-validation with GroupKFold...")
 
-X = train_df[feature_cols]
-y = train_df[label_cols]
+n_patients = train_df['Patient_code'].nunique()
+n_folds = min(3, n_patients)
+print(f"Using {n_folds}-fold GroupKFold CV")
 
-kf = KFold(n_splits=3, shuffle=True, random_state=42)
+gkf = GroupKFold(n_splits=n_folds)
 
-best_auc = -1
-best_model = None
+cv_results = []
 fold_num = 1
-best_cm = None
-# -------------------------
-# KFold CV Loop
-# -------------------------
-for train_idx, val_idx in kf.split(X):
 
-    print(f"\n==============================")
-    print(f" Fold {fold_num}")
-    print("==============================")
+for train_idx, val_idx in gkf.split(X_train, y_train, groups_train):
+    print(f"\n{'='*50}")
+    print(f"Fold {fold_num}/{n_folds}")
+    print(f"{'='*50}")
 
     model = build_model()
 
-    X_tr, X_test = X.iloc[train_idx], X.iloc[val_idx]
-    y_tr, y_test = y.iloc[train_idx], y.iloc[val_idx]
+    X_tr, X_val = X_train[train_idx], X_train[val_idx]
+    y_tr, y_val = y_train[train_idx], y_train[val_idx]
+
+    # Get patient IDs for this fold
+    train_pats = train_df.iloc[train_idx]['Patient_code'].unique()
+    val_pats = train_df.iloc[val_idx]['Patient_code'].unique()
+
+    print(f"Train patients: {sorted(train_pats)}")
+    print(f"Val patients:   {sorted(val_pats)}")
+    print(f"Train: {len(X_tr)} cells, Val: {len(X_val)} cells")
 
     # Train
-    model.fit(X_tr, y_tr)
+    model.fit(X_tr, y_tr.ravel())
 
     # Predict
-    y_pred = model.predict(X_test)
-    y_prob = model.predict_proba(X_test)[:, 1]
+    y_pred = model.predict(X_val)
+    y_prob = model.predict_proba(X_val)[:, 1]
 
     # Metrics
-    acc = accuracy_score(y_test, y_pred)
-    auc = roc_auc_score(y_test, y_prob) if len(np.unique(y_test)) > 1 else np.nan
-    cm = confusion_matrix(y_test, y_pred)
-    print(f"Accuracy: {acc:.4f}")
-    print(f"AUC-ROC: {auc:.4f}")
-    print("\nConfusion Matrix:\n", cm)
-    print("\nClassification Report:\n", classification_report(y_test, y_pred))
-
-    # -------------------------
-    # SAVE BEST MODEL
-    # -------------------------
-    if auc > best_auc:   # <-- CHANGE TO acc > best_acc IF YOU WANT ACCURACY
-        best_auc = auc
-        best_model = model
-        best_cm = cm    
-        print("🔥 New best model found and stored.")
+    acc = accuracy_score(y_val, y_pred)
+    auc = roc_auc_score(y_val, y_prob) if len(np.unique(y_val)) > 1 else np.nan
+    cm = confusion_matrix(y_val, y_pred)
 
-    fold_num += 1
+    print(f"Accuracy: {acc:.4f}")
+    print(f"AUC-ROC:  {auc:.4f}")
+    print(f"Confusion Matrix:\n{cm}")
 
-print("\n==============================")
-print(" BEST MODEL FROM CV")
-print("==============================")
-print(f"Best AUC: {best_auc:.4f}")
+    cv_results.append({
+        'fold': fold_num,
+        'accuracy': acc,
+        'auc': auc
+    })
 
-#Confusion Matrix
+    fold_num += 1
 
-print("\nConfusion Matrix:\n", best_cm)
+# CV Summary
+cv_df = pd.DataFrame(cv_results)
+print(f"\n{'='*50}")
+print("CROSS-VALIDATION SUMMARY")
+print(f"{'='*50}")
+print(cv_df.to_string(index=False))
+print(f"\nMean Accuracy: {cv_df['accuracy'].mean():.4f} ± {cv_df['accuracy'].std():.4f}")
+if not cv_df['auc'].isna().all():
+    print(f"Mean AUC:      {cv_df['auc'].mean():.4f} ± {cv_df['auc'].std():.4f}")
+
+# ========================================
+# 9. TRAIN FINAL MODEL (on full training set)
+# ========================================
+print(f"\n[8] Training final model on FULL training set...")
+
+final_model = build_model()
+final_model.fit(X_train, y_train.ravel())
+
+# ========================================
+# 10. EVALUATE ON TEST SET
+# ========================================
+print(f"\n[9] Evaluating on held-out test set...")
+
+y_test_pred = final_model.predict(X_test)
+y_test_prob = final_model.predict_proba(X_test)[:, 1] if len(np.unique(y_test)) > 1 else None
+
+test_acc = accuracy_score(y_test, y_test_pred)
+test_auc = roc_auc_score(y_test, y_test_prob) if y_test_prob is not None and len(np.unique(y_test)) > 1 else np.nan
+test_cm = confusion_matrix(y_test, y_test_pred)
+
+print(f"\n{'='*50}")
+print("TEST SET EVALUATION")
+print(f"{'='*50}")
+print(f"Accuracy: {test_acc:.4f}")
+print(f"AUC-ROC:  {test_auc:.4f}")
+print(f"Confusion Matrix:\n{test_cm}")
+if len(np.unique(y_test)) > 1:
+    print(f"\nClassification Report:\n{classification_report(y_test, y_test_pred, target_names=response_encoder.classes_)}")
+
+# ========================================
+# 11. SAVE MODEL & ARTIFACTS
+# ========================================
+print(f"\n[10] Saving model and preprocessing artifacts...")
+
+model_artifact = {
+    'model': final_model,
+    'response_encoder': response_encoder,
+    'origin_encoder': origin_encoder,
+    'iqr_bounds': iqr_bounds,
+    'feature_cols': feature_cols,
+    'train_patients': sorted(train_df['Patient_code'].unique()),
+    'test_patients': sorted(test_df['Patient_code'].unique()),
+    'cv_results': cv_df.to_dict('records'),
+    'test_metrics': {
+        'accuracy': test_acc,
+        'auc': test_auc,
+        'confusion_matrix': test_cm.tolist()
+    }
+}
 
-# -------------------------
-# SAVE BEST MODEL TO .pkl
-# -------------------------
 with open("chemo_model.pkl", "wb") as f:
-    pickle.dump(best_model, f)
-
-print("\n✅ Chemo model saved")
\ No newline at end of file
+    pickle.dump(model_artifact, f)
+
+print("Chemo model saved to: chemo_model.pkl")
+print("\nModel artifact includes:")
+print("  - Trained model (with fitted StandardScaler)")
+print("  - Response encoder")
+print("  - Origin encoder")
+print("  - IQR bounds")
+print("  - Feature columns")
+print("  - CV results")
+print("  - Test metrics")
+
+print("\n" + "="*70)
+print("TRAINING COMPLETE")
+print("="*70)
\ No newline at end of file
diff --git a/flask_backend/models/BreastCancer_SVM_v1/combo_model.pkl b/flask_backend/models/BreastCancer_SVM_v1/combo_model.pkl
index c748ec6d..9f2d9315 100644
Binary files a/flask_backend/models/BreastCancer_SVM_v1/combo_model.pkl and b/flask_backend/models/BreastCancer_SVM_v1/combo_model.pkl differ
diff --git a/flask_backend/models/BreastCancer_SVM_v1/combo_model_training.py b/flask_backend/models/BreastCancer_SVM_v1/combo_model_training.py
index 44eb6323..b17d846b 100644
--- a/flask_backend/models/BreastCancer_SVM_v1/combo_model_training.py
+++ b/flask_backend/models/BreastCancer_SVM_v1/combo_model_training.py
@@ -1,153 +1,213 @@
-
-# Breast Cancer Response Prediction (Post-treatment) combo  
-# This notebook trains a SVM Classifier to predict patient response based on post-treatment intervention data.
+"""
+Breast Cancer Response Prediction - Combination Therapy Pre-treatment
+"""
 
 import warnings
 import pickle
 import pandas as pd
 import numpy as np
-from sklearn.model_selection import KFold
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import GroupShuffleSplit
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
 from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
 from sklearn.svm import SVC
-from utils import remove_iqr_outliers
-from sklearn.model_selection import cross_val_score, KFold
-from sklearn.preprocessing import LabelEncoder, OneHotEncoder
-from sklearn.model_selection import GroupShuffleSplit
+from utils import remove_iqr_outliers, apply_iqr_bounds
 from argparse import ArgumentParser
 
 warnings.filterwarnings("ignore")
 
-# Load Data
 parser = ArgumentParser()
 parser.add_argument("--input", type=str, required=True)
 args = parser.parse_args()
+
+print("="*70)
+print("COMBINATION THERAPY PRE-TREATMENT MODEL - FINAL")
+print("="*70)
+
+# ========================================
+# 1. LOAD & FILTER DATA
+# ========================================
+print("\n[1] Loading data...")
 df = pd.read_csv(args.input)
 
-# Filter for Post_treatment
 df_pre_combo = df[(df['Timeline'] == 'Pre_treatment') & (df['Treatment'] == 'anti-PDL1+Chemo')].copy()
-print(f"Original shape: {df.shape}, Pre-treatment shape: {df_pre_combo.shape}")
+print(f"Filtered to Pre-treatment Combo: {df_pre_combo.shape[0]} cells, {df_pre_combo['Patient_code'].nunique()} patients")
 
-# Drop redundant/irrelevant columns
+# Drop unnecessary columns
 drop_cols = ['Tissue', 'Identifier', 'Timeline', 'defcls', 'Treatment']
+for col in ['group', 'batch', 'myleiden']:
+    if col in df_pre_combo.columns:
+        drop_cols.append(col)
+
 df_combo = df_pre_combo.drop(columns=drop_cols)
 
-# Encode categorical variables
-le = LabelEncoder()
-categorical_cols = [ 'Response']
+# ========================================
+# 2. TRAIN/TEST SPLIT BY PATIENT (FIRST!)
+# ========================================
+print("\n[2] Patient-level train/test split...")
 
-label_mappings = {}
-for col in categorical_cols:
-    df_combo[col] = le.fit_transform(df_combo[col])
-    label_mappings[col] = dict(zip(le.classes_, le.transform(le.classes_)))
+gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
+groups = df_combo["Patient_code"]
 
-#One Hot Encode Origin
-categorical_cols = ['Origin']
-for col in categorical_cols:
-    encoder = OneHotEncoder(sparse_output=False)
-    encoded_array = encoder.fit_transform(df_combo[[col]])
-    encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out([col]))
-    df_combo = pd.concat([df_combo.drop(columns=[col]).reset_index(drop=True), encoded_df], axis=1)
+train_idx, test_idx = next(gss.split(df_combo, df_combo["Response"], groups=groups))
 
-#Outlier Removal
+train_df = df_combo.iloc[train_idx].copy()
+test_df = df_combo.iloc[test_idx].copy()
 
-feature_cols = ["Expression",	'Origin_chest_wall', 'Origin_liver',
-       'Origin_lymph_node', "nGene",	"percent_mito",	"percent_hsp",	"percent_ig",	"percent_rp",	"nUMI",	"PDCD1"]
-label_cols = ["Response"]
-df_combo_ = remove_iqr_outliers(df_combo, feature_cols)
+print(f"Train: {len(train_df)} cells, {train_df['Patient_code'].nunique()} patients {sorted(train_df['Patient_code'].unique())}")
+print(f"Test:  {len(test_df)} cells, {test_df['Patient_code'].nunique()} patients {sorted(test_df['Patient_code'].unique())}")
 
-#Model Training
+# ========================================
+# 3. ENCODE RESPONSE (TRAIN ONLY)
+# ========================================
+print("\n[3] Encoding Response...")
 
-gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
+response_encoder = LabelEncoder()
+train_df['Response'] = response_encoder.fit_transform(train_df['Response'])
+test_df['Response'] = response_encoder.transform(test_df['Response'])
 
-# groups = your patient IDs
-groups = df_combo_["Patient_code"]
-
-train_idx, test_idx = next(gss.split(df_combo_, df_combo_["Response"], groups=groups))
-
-train_df = df_combo_.iloc[train_idx]
-test_df  = df_combo_.iloc[test_idx]
-
-# -------------------------
-# Define Model
-# -------------------------
-def build_model():
-    return make_pipeline(
-        StandardScaler(),
-        SVC(
-            kernel="rbf",
-            C=15,
-            gamma=0.1,
-            class_weight="balanced",
-            probability=True,
-            random_state=42
-        )
-    )
-
-X = train_df[feature_cols]
-y = train_df[label_cols]
-
-kf = KFold(n_splits=3, shuffle=True, random_state=42)
-
-best_auc = -1
-best_model = None
-fold_num = 1
-best_cm = None
-# -------------------------
-# KFold CV Loop
-# -------------------------
-for train_idx, val_idx in kf.split(X):
-
-    print(f"\n==============================")
-    print(f" Fold {fold_num}")
-    print("==============================")
-
-    model = build_model()
-
-    X_tr, X_test = X.iloc[train_idx], X.iloc[val_idx]
-    y_tr, y_test = y.iloc[train_idx], y.iloc[val_idx]
-
-    # Train
-    model.fit(X_tr, y_tr)
-
-    # Predict
-    y_pred = model.predict(X_test)
-    y_prob = model.predict_proba(X_test)[:, 1]
-
-    # Metrics
-    acc = accuracy_score(y_test, y_pred)
-    auc = roc_auc_score(y_test, y_prob) if len(np.unique(y_test)) > 1 else np.nan
-    cm = confusion_matrix(y_test, y_pred)
-    print(f"Accuracy: {acc:.4f}")
-    print(f"AUC-ROC: {auc:.4f}")
-    print("\nConfusion Matrix:\n", cm)
-    print("\nClassification Report:\n", classification_report(y_test, y_pred))
-
-    # -------------------------
-    # SAVE BEST MODEL
-    # -------------------------
-    if auc > best_auc:   # <-- CHANGE TO acc > best_acc IF YOU WANT ACCURACY
-        best_auc = auc
-        best_model = model
-        best_cm = cm    
-        print("🔥 New best model found and stored.")
-
-    fold_num += 1
-
-print("\n==============================")
-print(" BEST MODEL FROM CV")
-print("==============================")
-print(f"Best AUC: {best_auc:.4f}")
-
-#Confusion Matrix
-
-print("\nConfusion Matrix:\n", best_cm)
-
-# -------------------------
-# SAVE BEST MODEL TO .pkl
-# -------------------------
-with open("combo_model.pkl", "wb") as f:
-    pickle.dump(best_model, f)
+label_mappings = dict(zip(response_encoder.classes_, response_encoder.transform(response_encoder.classes_)))
+print(f"Encoding: {label_mappings}")
+
+# ========================================
+# 4. ONE-HOT ENCODE ORIGIN
+# ========================================
+print("\n[4] One-hot encoding Origin...")
+
+all_origins = pd.concat([train_df['Origin'], test_df['Origin']]).unique().reshape(-1, 1)
+
+origin_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
+origin_encoder.fit(all_origins)
+
+train_origin = origin_encoder.transform(train_df[['Origin']])
+test_origin = origin_encoder.transform(test_df[['Origin']])
+
+train_origin_df = pd.DataFrame(train_origin, columns=origin_encoder.get_feature_names_out(['Origin']), index=train_df.index)
+test_origin_df = pd.DataFrame(test_origin, columns=origin_encoder.get_feature_names_out(['Origin']), index=test_df.index)
+
+train_df = pd.concat([train_df.drop(columns=['Origin']), train_origin_df], axis=1)
+test_df = pd.concat([test_df.drop(columns=['Origin']), test_origin_df], axis=1)
+
+print(f"Origin columns: {list(origin_encoder.get_feature_names_out(['Origin']))}")
 
-print("\n✅ combo model saved")
\ No newline at end of file
+# ========================================
+# 5. OUTLIER REMOVAL (TRAIN ONLY)
+# ========================================
+print("\n[5] Outlier removal...")
+
+feature_cols = ["Expression", 'Origin_chest_wall', 'Origin_liver', 'Origin_lymph_node',
+                "nGene", "percent_mito", "percent_hsp", "percent_ig", "percent_rp", "nUMI", "PDCD1"]
+
+train_df, iqr_bounds = remove_iqr_outliers(train_df, feature_cols)
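+# test_df = apply_iqr_bounds(test_df, iqr_bounds)  # NOTE(review): disabled, so test cells are not IQR-filtered here (the chemo script does filter them) - confirm this is intended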
+
+# ========================================
+# 6. PREPARE FEATURES
+# ========================================
+X_train = train_df[feature_cols].values
+y_train = train_df['Response'].values
+
+X_test = test_df[feature_cols].values
+y_test = test_df['Response'].values
+
+print(f"\n[6] Final shapes: X_train={X_train.shape}, X_test={X_test.shape}")
+
+# ========================================
+# 7. CROSS-VALIDATION ASSESSMENT
+# ========================================
+print("\n[7] Cross-validation assessment...")
+
+n_patients = train_df['Patient_code'].nunique()
+print(f"Training patients: {n_patients}")
+print(f"Response in train: R={np.sum(y_train)}, NR={len(y_train)-np.sum(y_train)}")
+
+print(f"\n⚠️  DATASET TOO SMALL FOR RELIABLE CROSS-VALIDATION")
+print(f"With only {n_patients} training patients, GroupKFold would create")
+print(f"validation folds with insufficient samples or single-class data.")
+print(f"\nCross-validation SKIPPED (methodologically correct for this dataset size).")
+print(f"Model will be evaluated on held-out test set instead.")
+
+# ========================================
+# 8. TRAIN MODEL ON FULL TRAINING SET
+# ========================================
+print(f"\n[8] Training model on full training set...")
+
+model = Pipeline([
+    ('scaler', StandardScaler()),
+    ('svm', SVC(kernel="rbf", C=15, gamma=0.1, class_weight="balanced", probability=True, random_state=42))
+])
+
+model.fit(X_train, y_train.ravel())
+print("✅ Model trained")
+
+# ========================================
+# 9. TRAIN SET EVALUATION
+# ========================================
+print(f"\n[9] Training set performance...")
+
+y_train_pred = model.predict(X_train)
+y_train_prob = model.predict_proba(X_train)[:, 1]
+
+train_acc = accuracy_score(y_train, y_train_pred)
+train_auc = roc_auc_score(y_train, y_train_prob) if len(np.unique(y_train)) > 1 else np.nan
+
+print(f"Train Accuracy: {train_acc:.4f}")
+print(f"Train AUC:      {train_auc:.4f}")
+
+# ========================================
+# 10. TEST SET EVALUATION
+# ========================================
+print(f"\n[10] Test set evaluation...")
+
+y_test_pred = model.predict(X_test)
+y_test_prob = model.predict_proba(X_test)[:, 1] if len(np.unique(y_test)) > 1 else None
+
+test_acc = accuracy_score(y_test, y_test_pred)
+test_auc = roc_auc_score(y_test, y_test_prob) if y_test_prob is not None and len(np.unique(y_test)) > 1 else np.nan
+test_cm = confusion_matrix(y_test, y_test_pred)
+
+print(f"\n{'='*50}")
+print("TEST SET RESULTS")
+print(f"{'='*50}")
+print(f"Accuracy: {test_acc:.4f}")
+print(f"AUC:      {test_auc:.4f}")
+print(f"\nConfusion Matrix:\n{test_cm}")
+
+if len(np.unique(y_test)) > 1:
+    print(f"\nClassification Report:\n{classification_report(y_test, y_test_pred, target_names=response_encoder.classes_)}")
+
+# ========================================
+# 11. SAVE MODEL
+# ========================================
+print(f"\n[11] Saving model...")
+
+model_artifact = {
+    'model': model,
+    'response_encoder': response_encoder,
+    'origin_encoder': origin_encoder,
+    'iqr_bounds': iqr_bounds,
+    'feature_cols': feature_cols,
+    'label_mappings': label_mappings,
+    'train_patients': sorted(train_df['Patient_code'].unique()),
+    'test_patients': sorted(test_df['Patient_code'].unique()),
+    'test_metrics': {
+        'accuracy': test_acc,
+        'auc': test_auc,
+        'confusion_matrix': test_cm.tolist()
+    },
+    'note': 'CV skipped - dataset too small for reliable GroupKFold'
+}
+
+with open("combo_model.pkl", "wb") as f:
+    pickle.dump(model_artifact, f)
+
+print("combo_model.pkl saved")
+
+print("\n" + "="*70)
+print("TRAINING COMPLETE")
+print("="*70)
+print(f"\nModel Performance:")
+print(f"  Train: Acc={train_acc:.3f}, AUC={train_auc:.3f}")
+print(f"  Test:  Acc={test_acc:.3f}, AUC={test_auc:.3f}")
+print(f"\nLimitation: Only {n_patients} training patients")
+print(f"Results are exploratory - larger cohort needed for robust predictions")
\ No newline at end of file
diff --git a/flask_backend/models/BreastCancer_SVM_v1/utils.py b/flask_backend/models/BreastCancer_SVM_v1/utils.py
index 634d9fee..7c242c20 100644
--- a/flask_backend/models/BreastCancer_SVM_v1/utils.py
+++ b/flask_backend/models/BreastCancer_SVM_v1/utils.py
@@ -1,69 +1,145 @@
-import os
-import sys
-import numpy as np
-import pandas as pd
-from typing import List
-
-def int_conventor(df: pd.DataFrame, col_names: list [str]) -> pd.DataFrame:
-    '''
-    A function to convert the column names from another data type to integer
-    :param df: data frame
-    :param col_names: Column names to be converted in to integer
-    :return: pd.dataFrame
-    '''
-    try:
-        for col in col_names:
-            df[col] = df[col].astype(int)
-        return df
-
-    except Exception as e:
-        raise e
+"""
+Utility Functions for Breast Cancer Model Training
+"""
 
-def float_conventor(df: pd.DataFrame, col_names: list[str]) -> pd.DataFrame:
-    '''
-    A function to convert the column names from another data type to integer
-    :param df: data frame
-    :param col_names: Column names to be converted in to integer
-    :return: pd.dataFrame
-    '''
-    try:
-        for col in col_names:
-            df[col] = df[col].astype(float)
-        return df
+import pandas as pd
+import numpy as np
 
-    except Exception as e:
-        raise e
 
-def remove_cols(df: pd.DataFrame, col_names: list[str]) -> pd.DataFrame:
-    '''
-    A function to remove columns from a data type
-    :param df:
-    :param col_names:
-    :return: pd.DataFrame
-    '''
+def remove_iqr_outliers(df, feature_cols, multiplier=1.5):
+    """
+    Remove outliers using IQR method and return bounds.
 
-    try:
-        df = df.drop(col_names, axis=1, errors='ignore')
-        return df
-    except Exception as e:
-        raise e
+    Args:
+        df: DataFrame containing the data
+        feature_cols: List of feature column names to check for outliers
+        multiplier: IQR multiplier for outlier detection (default: 1.5)
+    
+    Returns:
+        clean_df: DataFrame with outliers removed
+        iqr_bounds: Dictionary of {column: (lower_bound, upper_bound)}
+    """
+    iqr_bounds = {}
+    mask = pd.Series(True, index=df.index)
+    
+    for col in feature_cols:
+        # Skip columns that are missing from df or have object dtype
+        if col not in df.columns:
+            continue
+        
+        if df[col].dtype == 'object':
+            continue
+        
+        # Calculate IQR
+        Q1 = df[col].quantile(0.25)
+        Q3 = df[col].quantile(0.75)
+        IQR = Q3 - Q1
+        
+        # Calculate bounds (when IQR == 0 both bounds equal Q1, so only rows equal to Q1 are kept)
+        lower = Q1 - multiplier * IQR
+        upper = Q3 + multiplier * IQR
+        
+        # Store bounds for later use on test set
+        iqr_bounds[col] = (lower, upper)
+        
+        # Keep rows inside the bounds; NaN fails both comparisons, so rows with NaN here are dropped
+        col_mask = (df[col] >= lower) & (df[col] <= upper)
+        mask = mask & col_mask
+    
+    # Apply mask
+    clean_df = df[mask].copy()
+    
+    print(f"  IQR outlier removal: {len(df)} -> {len(clean_df)} rows ({len(df)-len(clean_df)} outliers removed)")
+    
+    return clean_df, iqr_bounds
 
 
-def remove_iqr_outliers(df, cols):
+def apply_iqr_bounds(df, iqr_bounds):
     """
-    Removes outliers using IQR method for the given numeric columns.
-    Keeps rows where each feature is within [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
+    Apply pre-computed IQR bounds to filter a dataset.
+    
+    This function is used to apply training-set outlier bounds to test data,
+    ensuring no test data statistics leak into preprocessing.
+    
+    Args:
+        df: DataFrame to filter
+        iqr_bounds: Dictionary of {column: (lower_bound, upper_bound)} from training set
+    
+    Returns:
+        filtered_df: DataFrame with outliers removed based on training bounds
     """
-    df_clean = df.copy()
-
-    for col in cols:
-        Q1 = df_clean[col].quantile(0.25)
-        Q3 = df_clean[col].quantile(0.75)
-        IQR = Q3 - Q1
+    mask = pd.Series(True, index=df.index)
+    
+    for col, (lower, upper) in iqr_bounds.items():
+        if col in df.columns:
+            col_mask = (df[col] >= lower) & (df[col] <= upper)
+            mask = mask & col_mask
+    
+    filtered_df = df[mask].copy()
+    
+    print(f"  Applied IQR bounds: {len(df)} -> {len(filtered_df)} rows ({len(df)-len(filtered_df)} outliers removed)")
+    
+    return filtered_df
 
-        lower_bound = Q1 - 1.5 * IQR
-        upper_bound = Q3 + 1.5 * IQR
 
-        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
-
-    return df_clean
+def validate_preprocessing_pipeline(train_df, test_df, feature_cols):
+    """
+    Validate that preprocessing was done correctly (no data leakage).
+    
+    Args:
+        train_df: Training dataframe
+        test_df: Test dataframe
+        feature_cols: List of feature columns
+    
+    Returns:
+        dict: check name -> bool; True means the check passed ('train_test_overlap' is True when NO patient is shared)
+    """
+    validation = {
+        'train_test_overlap': False,
+        'feature_columns_match': False,
+        'no_nulls_train': False,
+        'no_nulls_test': False
+    }
+    
+    # Check for patient overlap
+    train_patients = set(train_df['Patient_code'].unique())
+    test_patients = set(test_df['Patient_code'].unique())
+    overlap = train_patients & test_patients
+    
+    if len(overlap) == 0:
+        validation['train_test_overlap'] = True
+        print("✅ No patient overlap between train and test")
+    else:
+        print(f" Patient overlap detected: {overlap}")
+    
+    # Check feature columns
+    train_features = set(train_df.columns) & set(feature_cols)
+    test_features = set(test_df.columns) & set(feature_cols)
+    
+    if train_features == test_features:
+        validation['feature_columns_match'] = True
+        print("✅ Feature columns match between train and test")
+    else:
+        missing_in_test = train_features - test_features
+        missing_in_train = test_features - train_features
+        if missing_in_test:
+            print(f" Features missing in test: {missing_in_test}")
+        if missing_in_train:
+            print(f" Features missing in train: {missing_in_train}")
+    
+    # Check for nulls
+    if not train_df[feature_cols].isnull().any().any():
+        validation['no_nulls_train'] = True
+        print("✅ No nulls in training features")
+    else:
+        null_cols = train_df[feature_cols].columns[train_df[feature_cols].isnull().any()].tolist()
+        print(f" Nulls found in training features: {null_cols}")
+    
+    if not test_df[feature_cols].isnull().any().any():
+        validation['no_nulls_test'] = True
+        print("✅ No nulls in test features")
+    else:
+        null_cols = test_df[feature_cols].columns[test_df[feature_cols].isnull().any()].tolist()
+        print(f" Nulls found in test features: {null_cols}")
+    
+    return validation
\ No newline at end of file