Compare revisions: m21aouad/mini-projet-intro-ml
@@ -21,46 +21,192 @@ from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
"""
Kidney data
Data description: 25 features (11 numeric, 14 nominal)
Numerical Data (11):
1. age: Age in years
2. bp: Blood Pressure in mm/Hg
3. bgr: Blood Glucose Random in mg/dL
4. bu: Blood Urea in mg/dL
5. sc: Serum Creatinine in mg/dL
6. sod: Sodium in mEq/L
7. pot: Potassium in mEq/L
8. hemo: Hemoglobin in gms
9. pcv: Packed Cell Volume (unit not specified)
10. wc: White Blood Cell Count in cells/cumm
11. rc: Red Blood Cell Count in millions/cmm
Nominal Data (14):
1. sg: Specific Gravity (nominal, categories: 1.005, 1.010, 1.015, 1.020, 1.025)
2. al: Albumin (nominal, categories: 0, 1, 2, 3, 4, 5)
3. su: Sugar (nominal, categories: 0, 1, 2, 3, 4, 5)
4. rbc: Red Blood Cells (nominal, categories: normal, abnormal)
5. pc: Pus Cell (nominal, categories: normal, abnormal)
6. pcc: Pus Cell Clumps (nominal, categories: present, not present)
7. ba: Bacteria (nominal, categories: present, not present)
8. htn: Hypertension (nominal, categories: yes, no)
9. dm: Diabetes Mellitus (nominal, categories: yes, no)
10. cad: Coronary Artery Disease (nominal, categories: yes, no)
11. appet: Appetite (nominal, categories: good, poor)
12. pe: Pedal Edema (nominal, categories: yes, no)
13. ane: Anemia (nominal, categories: yes, no)
14. classification: Class (nominal, categories: ckd, notckd)
"""
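# A minimal loading sketch for the dataset described above (the file name
# "kidney_disease.csv" is an assumption, not part of this repo's documented layout;
# adjust the path to wherever the CSV lives):
# import pandas as pd
# df_kidney = pd.read_csv("kidney_disease.csv")
# print(df_kidney.shape)  # expected: the 25 feature columns listed above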
######### Main function for preprocessing ############
def pre_process_data(data):
    """
    Pre-process the data using the defined sub-functions.
    Parameters:
    -----------
    data (DataFrame): The dataset.
    Returns:
    ----------
    data: preprocessed data.
    """
    numerical_cols = get_numerical_columns(data)
    categorical_cols = get_categorical_columns(data)
    scale_normalize(data, numerical_cols)
    if categorical_cols != []:
        fill_categorical_kidney(data, categorical_cols)
        data = convert_categorical_feats(data, categorical_cols)
    fill_numerical_columns(data, skew_threshold=0.5)
    return data
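# Example usage (sketch; assumes df_kidney was loaded as shown above):
# df_kidney = pre_process_data(df_kidney)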
######### Main function for preparing the data for the training phase ############
def prepare_training_data(data, target, use_pca=True, threshold_variance_ratio=0.90):
    """
    Prepare the data for the training phase.
    Parameters:
    -----------
    data (DataFrame): The dataset.
    target (str): Name of the target column.
    use_pca (bool): Whether to project the features with PCA before splitting.
    threshold_variance_ratio (float): Cumulative explained-variance ratio to keep.
    Returns:
    ----------
    X_train, X_test, y_train, y_test, cv: train/test splits and the cross-validation strategy.
    """
    df = data
    if use_pca:
        df, explainable_ratios = feature_selection(data, target, threshold_variance_ratio=threshold_variance_ratio)
        # Explanatory plots
        fig, axes = plt.subplots(1, 2, figsize=(10, 4))
        axes[0].scatter(df.loc[df['classification'] == 0, 'PCA1'], df.loc[df['classification'] == 0, 'PCA2'], color='red', label="CKD")
        axes[0].scatter(df.loc[df['classification'] == 1, 'PCA1'], df.loc[df['classification'] == 1, 'PCA2'], color='green', label="NOT CKD")
        axes[0].set_xlabel("PCA1: First component")
        axes[0].set_ylabel("PCA2: Second component")
        axes[0].set_title('Data projection onto the first two PCA components')
        axes[0].legend()
        axes[1].plot(range(1, len(explainable_ratios) + 1), explainable_ratios)
        axes[1].axhline(y=threshold_variance_ratio, linestyle='--', color='red', label='Threshold Ratio')
        axes[1].set_xlabel('Number of components')
        axes[1].set_ylabel('Cumulative explained variance')
        axes[1].set_title("Cumulative explained variance vs number of components")
        axes[1].legend()
        plt.tight_layout()
        plt.subplots_adjust(wspace=0.4)
        plt.show()
    X_train, X_test, y_train, y_test, cv = split(df, target, alpha=0.2, n=5)
    return X_train, X_test, y_train, y_test, cv
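# Example usage (sketch; "classification" is the encoded target column used above):
# X_train, X_test, y_train, y_test, cv = prepare_training_data(df_kidney, "classification")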
######### Main function for training the data ############
def train_and_tune_model(X_train, y_train, model, param_grid, cv, scoring="f1", verbose=1):
    """
    Train and fine-tune a binary classification model using GridSearchCV.
    Parameters
    ----------
    X_train : DataFrame
        Training data features.
    y_train : Series
        Training data target.
    model : Estimator object
        The binary classification model to be trained.
    param_grid : dict
        The hyperparameter grid to use for fine-tuning the model.
    cv : Cross-validation strategy
        The cross-validation splitting strategy.
    scoring : str or list of str, optional
        The scoring metric(s) to use for evaluation. Default is 'f1'.
    verbose : int, optional
        The verbosity level.
    Returns
    -------
    best_model : Estimator object
        The best model found during the GridSearchCV process.
    """
    # Setting up the GridSearchCV
    grid_search = GridSearchCV(estimator=model,
                               param_grid=param_grid,
                               cv=cv,
                               scoring=scoring,
                               n_jobs=-1,  # use parallel processing
                               verbose=verbose)  # amount of progress information printed
    # Fitting the model
    grid_search.fit(X_train, y_train)
    # Retrieving the best model
    best_model = grid_search.best_estimator_
    return best_model
# Example usage
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier()
# param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
# best_rf_model = train_and_tune_model(X_train, y_train, model, param_grid, cv)
######### Main function to display the results for comparison purposes ############
def display_results(dict_models, X_train, y_train, X_test, y_test, cv, disp_col):
    """
    Display the F1 scores of different models in a DataFrame for comparison purposes.
    Parameters
    ----------
    dict_models : dict
        Contains the models that we want to compare along with their parameter grids.
    X_train : DataFrame
        Training data features.
    y_train : Series
        Training data target.
    X_test : DataFrame
        Test data features.
    y_test : Series
        Test data target.
    cv : StratifiedKFold
        Cross-validation strategy.
    disp_col : str
        Name of the column to be displayed.
    Returns
    -------
    df_results : DataFrame
        DataFrame with the F1 scores.
    """
    df_results = pd.DataFrame(columns=["Model Name", disp_col])
    for model_name, model_details in tqdm(dict_models.items(), desc="Going through each model defined in the dictionary..."):
        # Extract the details related to every model from the dict
        model_params = model_details["param_grid"]
        model = model_details["model"]
        best_model = train_and_tune_model(X_train, y_train, model, model_params, cv)
        score = test_model(X_test, y_test, best_model)  # evaluate the F1 score on test data
        rounded_score = np.round(score * 100, 2)
        new_row = {"Model Name": model_name, disp_col: rounded_score}
        df_results = pd.concat([df_results, pd.DataFrame([new_row])], ignore_index=True)

        conf_matrix = confusion_matrix(y_test, best_model.predict(X_test))
        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
        plt.title(f'Confusion Matrix - {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()

        # Print and analyze additional evaluation metrics
        y_pred = best_model.predict(X_test)
        print(f'Model: {model_name}')
        print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
        print(f'Precision: {precision_score(y_test, y_pred)}')
        print(f'Recall: {recall_score(y_test, y_pred)}')
        print(f'ROC-AUC: {roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])}')
        print('\n')

        # Plot learning curves
        train_sizes, train_scores, valid_scores = learning_curve(best_model, X_train, y_train, cv=cv, scoring='f1', n_jobs=-1)
        plt.figure(figsize=(8, 6))
        plt.plot(train_sizes, np.mean(train_scores, axis=1), label='Training F1 Score')
        plt.plot(train_sizes, np.mean(valid_scores, axis=1), label='Validation F1 Score')
        plt.xlabel('Training Examples')
        plt.ylabel('F1 Score')
        plt.legend()
        plt.title(f'Learning Curves - {model_name}')
        plt.show()

    # Apply styling after creating the DataFrame
    df_results = df_results.style.highlight_max(subset=[disp_col], color='salmon')  # highlight the model with the highest F1 score
    return df_results
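# Example usage (sketch; the two models and their grids below are illustrative
# assumptions, not the project's fixed choices):
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# dict_models = {
#     "Logistic Regression": {"model": LogisticRegression(max_iter=1000),
#                             "param_grid": {"C": [0.1, 1, 10]}},
#     "Random Forest": {"model": RandomForestClassifier(),
#                       "param_grid": {"n_estimators": [100, 200], "max_depth": [10, 20]}},
# }
# df_results = display_results(dict_models, X_train, y_train, X_test, y_test, cv, "F1 Score (%)")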
# numerical_columns = ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo',
#                      'pcv', 'wc', 'rc']
# nominal_columns = ['sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'htn',
#                    'dm', 'cad', 'appet', 'pe', 'ane', 'classification']
# Sub-Function to Identify Categorical Columns
def get_categorical_columns(data):
@@ -202,64 +348,6 @@ def has_outliers(df, col, z_score_threshold=3):
    z_scores = np.abs(stats.zscore(df[col].dropna()))
    return any(z_scores > z_score_threshold)
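# Example usage (sketch; assumes df_kidney and the numerical column names listed above):
# cols_with_outliers = [c for c in ['age', 'bp', 'bgr', 'bu', 'sc'] if has_outliers(df_kidney, c)]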
# def remove_outliers_iqr(df, col):
# """
# Remove outliers from a DataFrame based on the Interquartile Range (IQR) method.
# Parameters
# ----------
# df : Pandas.DataFrame
# The DataFrame containing the data of interest.
# col : str
# Column name from which to remove outliers.
# Returns
# -------
# DataFrame: Modified DataFrame with outliers removed.
# """
# Q1 = df[col].quantile(0.25)
# Q3 = df[col].quantile(0.75)
# IQR = Q3 - Q1
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR
# # Filtering out the outliers
# return df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
# skewed_columns_kidney = ['bp', 'age', 'bgr', 'bu', 'sc', 'pot', 'pcv', 'wc', 'rc']
# symmetric_columns_kidney = ['sod', 'hemo']
# def fill_numerical_kidney(df):
# """
# Parameters
# ----------
# df : Pandas.DataFrame
# must be the kidney data frame.
# Returns
# -------
# None. This function fills the numerical columns of the kidney df.
# """
# df.drop(df[df['wc'] == '\t?'].index, inplace=True)
# df.drop(df[df['pcv'] == '\t?'].index, inplace=True)
# df.drop(df[df['rc'] == '\t?'].index, inplace=True)
# for col in skewed_columns_kidney:
# fill_median(df, col)
# for col in symmetric_columns_kidney:
# fill_mean(df, col)
# fill_numerical_kidney(df_kidney)
# df_kidney.info()
# nan_count = df_kidney[df_kidney.isna().any(axis=1)].shape[0]
# print(f"Number of rows : {len(df_kidney)}")
# print(f"Number of rows with at least one NaN value: {nan_count}")
# print(f"{round(nan_count/len(df_kidney) * 100)}% of our rows have at least one"
# f" missing value")
def fill_numerical_columns(df, skew_threshold=0.5):
    """
@@ -464,78 +552,6 @@ def feature_selection(df, target, threshold_variance_ratio=0.99):
    return result_df, explained_variance_ratios
def test_model(X_test, y_test, model):
    """
    Evaluate the F1 score of a trained model on the test set.
@@ -560,54 +576,6 @@ def test_model(X_test, y_test, model):
    return score
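# Example usage (sketch; best_rf_model comes from the train_and_tune_model example above):
# f1 = test_model(X_test, y_test, best_rf_model)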