Compare revisions: m21aouad/mini-projet-intro-ml
@@ -21,46 +21,192 @@ from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
"""
Kidney data
Data description: 25 features (11 numeric, 14 nominal)
Numerical Data (11):
1. age: Age in years
2. bp: Blood Pressure in mm/Hg
3. bgr: Blood Glucose Random in mg/dL
4. bu: Blood Urea in mg/dL
5. sc: Serum Creatinine in mg/dL
6. sod: Sodium in mEq/L
7. pot: Potassium in mEq/L
8. hemo: Hemoglobin in gms
9. pcv: Packed Cell Volume (unit not specified)
10. wc: White Blood Cell Count in cells/cumm
11. rc: Red Blood Cell Count in millions/cmm
Nominal Data (14):
1. sg: Specific Gravity (nominal, categories: 1.005, 1.010, 1.015, 1.020, 1.025)
2. al: Albumin (nominal, categories: 0, 1, 2, 3, 4, 5)
3. su: Sugar (nominal, categories: 0, 1, 2, 3, 4, 5)
4. rbc: Red Blood Cells (nominal, categories: normal, abnormal)
5. pc: Pus Cell (nominal, categories: normal, abnormal)
6. pcc: Pus Cell Clumps (nominal, categories: present, not present)
7. ba: Bacteria (nominal, categories: present, not present)
8. htn: Hypertension (nominal, categories: yes, no)
9. dm: Diabetes Mellitus (nominal, categories: yes, no)
10. cad: Coronary Artery Disease (nominal, categories: yes, no)
11. appet: Appetite (nominal, categories: good, poor)
12. pe: Pedal Edema (nominal, categories: yes, no)
13. ane: Anemia (nominal, categories: yes, no)
14. classification: Class (nominal, categories: ckd, notckd)
"""
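# A minimal loading sketch for the dataset described above (the file name
# "kidney_disease.csv" is an assumption, not part of this repo's documented layout;
# adjust the path to wherever the CSV lives):
# import pandas as pd
# df_kidney = pd.read_csv("kidney_disease.csv")
# print(df_kidney.shape)  # expected: the 25 feature columns listed above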
######### Main function for preprocessing ############
def pre_process_data(data):
    """
    Pre-process the data using the defined sub-functions.
    Parameters:
    -----------
    data (DataFrame): The dataset.
    Returns:
    ----------
    data: preprocessed data.
    """
    numerical_cols = get_numerical_columns(data)
    categorical_cols = get_categorical_columns(data)
    scale_normalize(data, numerical_cols)
    if categorical_cols != []:
        fill_categorical_kidney(data, categorical_cols)
        data = convert_categorical_feats(data, categorical_cols)
    fill_numerical_columns(data, skew_threshold=0.5)
    return data
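# Example usage (sketch; assumes df_kidney was loaded as shown above):
# df_kidney = pre_process_data(df_kidney)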
######### Main function for preparing the data for the training phase ############
def prepare_training_data(data, target, use_pca=True, threshold_variance_ratio=0.90):
    """
    Prepare the data for the training phase.
    Parameters:
    -----------
    data (DataFrame): The dataset.
    target (str): Name of the target column.
    use_pca (bool): Whether to project the features with PCA before splitting.
    threshold_variance_ratio (float): Cumulative explained-variance ratio to keep.
    Returns:
    ----------
    X_train, X_test, y_train, y_test, cv: train/test splits and the cross-validation strategy.
    """
    df = data
    if use_pca:
        df, explainable_ratios = feature_selection(data, target, threshold_variance_ratio=threshold_variance_ratio)
        # Explanatory plots
        fig, axes = plt.subplots(1, 2, figsize=(10, 4))
        axes[0].scatter(df.loc[df['classification'] == 0, 'PCA1'], df.loc[df['classification'] == 0, 'PCA2'], color='red', label="CKD")
        axes[0].scatter(df.loc[df['classification'] == 1, 'PCA1'], df.loc[df['classification'] == 1, 'PCA2'], color='green', label="NOT CKD")
        axes[0].set_xlabel("PCA1: First component")
        axes[0].set_ylabel("PCA2: Second component")
        axes[0].set_title('Data projection onto the first two PCA components')
        axes[0].legend()
        axes[1].plot(range(1, len(explainable_ratios) + 1), explainable_ratios)
        axes[1].axhline(y=threshold_variance_ratio, linestyle='--', color='red', label='Threshold Ratio')
        axes[1].set_xlabel('Number of components')
        axes[1].set_ylabel('Cumulative explained variance')
        axes[1].set_title("Cumulative explained variance vs number of components")
        axes[1].legend()
        plt.tight_layout()
        plt.subplots_adjust(wspace=0.4)
        plt.show()
    X_train, X_test, y_train, y_test, cv = split(df, target, alpha=0.2, n=5)
    return X_train, X_test, y_train, y_test, cv
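# Example usage (sketch; "classification" is the encoded target column used above):
# X_train, X_test, y_train, y_test, cv = prepare_training_data(df_kidney, "classification")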
######### Main function for training the data ############
def train_and_tune_model(X_train, y_train, model, param_grid, cv, scoring="f1", verbose=1):
    """
    Train and fine-tune a binary classification model using GridSearchCV.
    Parameters
    ----------
    X_train : DataFrame
        Training data features.
    y_train : Series
        Training data target.
    model : Estimator object
        The binary classification model to be trained.
    param_grid : dict
        The hyperparameter grid to use for fine-tuning the model.
    cv : Cross-validation strategy
        The cross-validation splitting strategy.
    scoring : str or list of str, optional
        The scoring metric(s) to use for evaluation. Default is 'f1'.
    verbose : int, optional
        The verbosity level.
    Returns
    -------
    best_model : Estimator object
        The best model found during the GridSearchCV process.
    """
    # Setting up the GridSearchCV
    grid_search = GridSearchCV(estimator=model,
                               param_grid=param_grid,
                               cv=cv,
                               scoring=scoring,
                               n_jobs=-1,  # use parallel processing
                               verbose=verbose)  # amount of progress information printed
    # Fitting the model
    grid_search.fit(X_train, y_train)
    # Retrieving the best model
    best_model = grid_search.best_estimator_
    return best_model
# Example usage
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier()
# param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
# best_rf_model = train_and_tune_model(X_train, y_train, model, param_grid, cv)
######### Main function to display the results for comparison purposes ############
def display_results(dict_models, X_train, y_train, X_test, y_test, cv, disp_col):
    """
    Display the F1 scores of different models in a DataFrame for comparison purposes.
    Parameters
    ----------
    dict_models : dict
        Contains the models that we want to compare along with their parameter grids.
    X_train : DataFrame
        Training data features.
    y_train : Series
        Training data target.
    X_test : DataFrame
        Test data features.
    y_test : Series
        Test data target.
    cv : StratifiedKFold
        Cross-validation strategy.
    disp_col : str
        Name of the column to be displayed.
    Returns
    -------
    df_results : DataFrame
        DataFrame with the F1 scores.
    """
    df_results = pd.DataFrame(columns=["Model Name", disp_col])
    for model_name, model_details in tqdm(dict_models.items(), desc="Going through each model defined in the dictionary..."):
        # Extract the details related to every model from the dict
        model_params = model_details["param_grid"]
        model = model_details["model"]
        best_model = train_and_tune_model(X_train, y_train, model, model_params, cv)
        score = test_model(X_test, y_test, best_model)  # evaluate the F1 score on test data
        rounded_score = np.round(score * 100, 2)
        new_row = {"Model Name": model_name, disp_col: rounded_score}
        df_results = pd.concat([df_results, pd.DataFrame([new_row])], ignore_index=True)

        conf_matrix = confusion_matrix(y_test, best_model.predict(X_test))
        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
        plt.title(f'Confusion Matrix - {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()

        # Print and analyze additional evaluation metrics
        y_pred = best_model.predict(X_test)
        print(f'Model: {model_name}')
        print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
        print(f'Precision: {precision_score(y_test, y_pred)}')
        print(f'Recall: {recall_score(y_test, y_pred)}')
        print(f'ROC-AUC: {roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])}')
        print('\n')

        # Plot learning curves
        train_sizes, train_scores, valid_scores = learning_curve(best_model, X_train, y_train, cv=cv, scoring='f1', n_jobs=-1)
        plt.figure(figsize=(8, 6))
        plt.plot(train_sizes, np.mean(train_scores, axis=1), label='Training F1 Score')
        plt.plot(train_sizes, np.mean(valid_scores, axis=1), label='Validation F1 Score')
        plt.xlabel('Training Examples')
        plt.ylabel('F1 Score')
        plt.legend()
        plt.title(f'Learning Curves - {model_name}')
        plt.show()

    # Apply styling after creating the DataFrame
    df_results = df_results.style.highlight_max(subset=[disp_col], color='salmon')  # highlight the model with the highest F1 score
    return df_results
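# Example usage (sketch; the two models and their grids below are illustrative
# assumptions, not the project's fixed choices):
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# dict_models = {
#     "Logistic Regression": {"model": LogisticRegression(max_iter=1000),
#                             "param_grid": {"C": [0.1, 1, 10]}},
#     "Random Forest": {"model": RandomForestClassifier(),
#                       "param_grid": {"n_estimators": [100, 200], "max_depth": [10, 20]}},
# }
# df_results = display_results(dict_models, X_train, y_train, X_test, y_test, cv, "F1 Score (%)")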
# numerical_columns = ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo',
#                      'pcv', 'wc', 'rc']
# nominal_columns = ['sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'htn',
#                    'dm', 'cad', 'appet', 'pe', 'ane', 'classification']
# Sub-Function to Identify Categorical Columns
def get_categorical_columns(data):
@@ -202,64 +348,6 @@ def has_outliers(df, col, z_score_threshold=3):
    z_scores = np.abs(stats.zscore(df[col].dropna()))
    return any(z_scores > z_score_threshold)
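# Example usage (sketch; assumes df_kidney and the numerical column names listed above):
# cols_with_outliers = [c for c in ['age', 'bp', 'bgr', 'bu', 'sc'] if has_outliers(df_kidney, c)]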
# def remove_outliers_iqr(df, col):
# """
# Remove outliers from a DataFrame based on the Interquartile Range (IQR) method.
# Parameters
# ----------
# df : Pandas.DataFrame
# The DataFrame containing the data of interest.
# col : str
# Column name from which to remove outliers.
# Returns
# -------
# DataFrame: Modified DataFrame with outliers removed.
# """
# Q1 = df[col].quantile(0.25)
# Q3 = df[col].quantile(0.75)
# IQR = Q3 - Q1
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR
# # Filtering out the outliers
# return df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
# skewed_columns_kidney = ['bp', 'age', 'bgr', 'bu', 'sc', 'pot', 'pcv', 'wc', 'rc']
# symmetric_columns_kidney = ['sod', 'hemo']
# def fill_numerical_kidney(df):
# """
# Parameters
# ----------
# df : Pandas.DataFrame
# must be the kidney data frame.
# Returns
# -------
# None. This function fills the numerical columns of the kidney df.
# """
# df.drop(df[df['wc'] == '\t?'].index, inplace=True)
# df.drop(df[df['pcv'] == '\t?'].index, inplace=True)
# df.drop(df[df['rc'] == '\t?'].index, inplace=True)
# for col in skewed_columns_kidney:
# fill_median(df, col)
# for col in symmetric_columns_kidney:
# fill_mean(df, col)
# fill_numerical_kidney(df_kidney)
# df_kidney.info()
# nan_count = df_kidney[df_kidney.isna().any(axis=1)].shape[0]
# print(f"Number of rows : {len(df_kidney)}")
# print(f"Number of rows with at least one NaN value: {nan_count}")
# print(f"{round(nan_count/len(df_kidney) * 100)}% of our rows have at least one"
# f" missing value")
def fill_numerical_columns(df, skew_threshold=0.5):
    """
@@ -464,78 +552,6 @@ def feature_selection(df, target, threshold_variance_ratio=0.99):
    return result_df, explained_variance_ratios
def test_model(X_test, y_test, model):
    """
    Evaluate the F1 score of a trained model on the test set.
@@ -560,54 +576,6 @@ def test_model(X_test, y_test, model):
    return score
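# Example usage (sketch; best_rf_model comes from the train_and_tune_model example above):
# f1 = test_model(X_test, y_test, best_rf_model)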