Commit 31b9e72b authored by CHOUMMIKH Meriam

implement function to display results + function to test models

parent 7eaa8cbb
@@ -15,6 +15,7 @@ from tqdm import tqdm
 from sklearn.preprocessing import MinMaxScaler, StandardScaler, OrdinalEncoder, OneHotEncoder
 from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
 from sklearn.decomposition import PCA
+from sklearn.metrics import f1_score
 """
@@ -85,7 +86,7 @@ def get_numerical_columns(data):
     -----------
     list: List of names of numerical columns.
     """
-    return data.select_dtypes(include=['int64', 'float64']).columns.tolist()
+    return data.iloc[:,:-1].select_dtypes(include=['int64', 'float64']).columns.tolist()
 def visualise_numerical_data(df,columns=None):
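The new `iloc[:,:-1]` slice assumes the target label sits in the last column, so it no longer shows up among the numerical features. A minimal sketch of the effect on toy data (hypothetical frame, not from the repository):

    import pandas as pd

    # hypothetical toy frame: numeric features plus an integer target in the last column
    data = pd.DataFrame({
        "age": [25, 32, 47],                     # int64 feature
        "income": [30000.0, 52000.0, 61000.0],   # float64 feature
        "city": ["Lyon", "Nice", "Pau"],         # object, ignored by select_dtypes
        "label": [0, 1, 1],                      # int64 target, now excluded
    })

    # before the fix: ['age', 'income', 'label']
    # after the fix:  ['age', 'income']
    print(data.iloc[:, :-1].select_dtypes(include=['int64', 'float64']).columns.tolist())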
@@ -410,12 +411,12 @@ def convert_categorical_feats(df, categorical_cols):
     for col in categorical_cols:
         if len(df[col].value_counts()) <= 2:
-            df[col] = ord_encoder.fit_transform(df[[col]])
+            df[col] = ord_encoder.fit_transform(df[[col]]).astype(int)
         else:
             one_hot_df = one_hot_encoder.fit_transform(df[[col]])  # Use one-hot encoding for categorical variables with more than 2 unique values
             names_cols = one_hot_encoder.get_feature_names_out([col])
-            encoded_df = pd.DataFrame(one_hot_df, columns=names_cols)
+            encoded_df = pd.DataFrame(one_hot_df, columns=names_cols).astype(int)
             df = pd.concat([df, encoded_df], axis=1)
             df.drop(col, axis=1, inplace=True)
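The two `.astype(int)` casts replace the encoders' float output (0.0/1.0) with integer codes and dummies. A small standalone sketch of the resulting frames; the encoder construction sits outside this hunk, so the setup below is an assumption (`sparse_output` requires scikit-learn >= 1.2):

    import pandas as pd
    from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

    df = pd.DataFrame({"sex": ["F", "M", "F"], "city": ["Lyon", "Nice", "Pau"]})

    # assumed setup: a dense one-hot encoder, as implied by pd.DataFrame(one_hot_df, ...)
    ord_encoder = OrdinalEncoder()
    one_hot_encoder = OneHotEncoder(sparse_output=False)

    # binary column -> a single integer code column
    df["sex"] = ord_encoder.fit_transform(df[["sex"]]).astype(int)

    # >2 categories -> integer dummy columns city_Lyon, city_Nice, city_Pau
    dummies = pd.DataFrame(
        one_hot_encoder.fit_transform(df[["city"]]),
        columns=one_hot_encoder.get_feature_names_out(["city"]),
    ).astype(int)
    df = pd.concat([df.drop(columns="city"), dummies], axis=1)
    print(df.dtypes)  # every column is int64 now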
@@ -451,10 +452,78 @@ def feature_selection(df, target, threshold_variance_ratio=0.99):
     result_df = pd.DataFrame(X[:, :nb_feats], columns=[f'PCA{i+1}' for i in range(nb_feats)])
     result_df[target] = df[target]
-    return result_df, pca.explained_variance_ratio_
+    return result_df, explained_variance_ratios
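The rename indicates the ratios are now stored in a local variable earlier in `feature_selection`; that part falls outside the hunk. For orientation, a hypothetical reconstruction of the variance-threshold idea the visible lines imply (`pca_reduce` is an illustrative name, not the committed function):

    import numpy as np
    import pandas as pd
    from sklearn.decomposition import PCA

    def pca_reduce(df, target, threshold_variance_ratio=0.99):
        # project the features, then keep the smallest number of components
        # whose cumulative explained variance reaches the threshold
        pca = PCA()
        X = pca.fit_transform(df.drop(columns=[target]))
        explained_variance_ratios = pca.explained_variance_ratio_
        cumulative = np.cumsum(explained_variance_ratios)
        nb_feats = int(np.searchsorted(cumulative, threshold_variance_ratio) + 1)
        result_df = pd.DataFrame(X[:, :nb_feats], columns=[f'PCA{i+1}' for i in range(nb_feats)])
        result_df[target] = df[target]
        return result_df, explained_variance_ratios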
-def train_and_tune_model(X_train, y_train, model, param_grid, cv, scoring = "f1_score", verbose=1):
+def display_results(dict_models, X_train, y_train, X_test, y_test, cv, disp_col):
"""
Display the F1 scores of different models in a DataFrame for comparison purposes.
Parameters
----------
dict_models : dict
Contains the models that we want to compare along with their parameter grids.
X_train : DataFrame
Training data features.
y_train : Series
Training data target
X_test : DataFrame
Test data features
y_test : Series
Test data target
cv : StratifiedKFold
Cross-validation strategy
disp_col : str
Name of the column to be displayed
Returns
-------
df_results : DataFrame
DataFrame with the F1 scores.
"""
df_results = pd.DataFrame(columns=["Model Name",disp_col])
for model_name, model_details in tqdm(dict_models.items(), desc="Going through each model defined in the dictionnary..."):
#extract the details related to every model from the dict
model_params = model_details["param_grid"]
model = model_details["model"]
best_model = train_and_tune_model(X_train, y_train, model, model_params, cv)
score = test_model(X_test, y_test, best_model) #evaluate f1 score on test data
rounded_score = np.round(score*100,2)
new_row = {"Model Name": model_name, disp_col: rounded_score}
df_results = pd.concat([df_results, pd.DataFrame([new_row])], ignore_index=True)
df_results = df_results.style.highlight_max(subset=[disp_col], color='salmon') #highlight the model with the higher f1 score
return df_results
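A hypothetical usage sketch. The "model"/"param_grid" keys mirror the loop above; the estimators, grids, and split are illustrative assumptions, not from the repository:

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedKFold, train_test_split

    # assumed: X, y are the preprocessed features and binary target
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

    dict_models = {
        "Logistic Regression": {
            "model": LogisticRegression(max_iter=1000),
            "param_grid": {"C": [0.1, 1, 10]},
        },
        "Random Forest": {
            "model": RandomForestClassifier(random_state=0),
            "param_grid": {"n_estimators": [100, 300], "max_depth": [None, 10]},
        },
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    results = display_results(dict_models, X_train, y_train, X_test, y_test, cv, "F1 score (%)")
    results  # in a notebook, renders the styled table with the best score highlighted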
+def test_model(X_test, y_test, model):
+    """
+    Evaluate the F1 score of a trained model on the test set.
+    Parameters
+    ----------
+    X_test : DataFrame
+        Test data features.
+    y_test : Series
+        Test data target.
+    model : trained model
+        The model to be evaluated.
+    Returns
+    -------
+    score : float
+        The F1 score of the model on the test set.
+    """
+    y_pred = model.predict(X_test)
+    score = f1_score(y_test, y_pred)
+    return score
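One caveat worth noting: `f1_score` is called with its default `average='binary'`, so `test_model` as written only suits binary targets; multiclass data would need an explicit `average` argument. Direct usage (assuming a fitted classifier `clf` and an existing test split):

    # hypothetical: clf is any fitted binary classifier exposing .predict
    score = test_model(X_test, y_test, clf)
    print(f"Test F1: {score:.3f}")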
+def train_and_tune_model(X_train, y_train, model, param_grid, cv, scoring = "f1", verbose=1):
     """
     Train and fine-tune a binary classification model using GridSearchCV.
...
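The body of `train_and_tune_model` is truncated in this view. Note that the default scorer was corrected from "f1_score" (not a valid scikit-learn scorer name) to "f1". Judging only from the signature and docstring, a minimal sketch of such a helper might look like this (an assumption, not the committed implementation):

    from sklearn.model_selection import GridSearchCV

    def train_and_tune_model_sketch(X_train, y_train, model, param_grid, cv, scoring="f1", verbose=1):
        # exhaustive search over param_grid, scoring each candidate by F1 across the CV folds
        grid = GridSearchCV(model, param_grid, scoring=scoring, cv=cv, verbose=verbose)
        grid.fit(X_train, y_train)
        # refit=True (the default) retrains the best candidate on all of X_train,
        # so display_results gets back a fitted estimator ready for test_model
        return grid.best_estimator_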