AOUAD Mohamed, Jad / Mini Projet Intro ML

Compare revisions: 6c7a7bfe47320a9f267a4b0bade72cb8c498baf9 to 203c1286832064ac59e65398f819a301718c42f5
Changes are shown as if the source revision was being merged into the target revision.

Source: m21aouad/mini-projet-intro-ml @ 203c1286832064ac59e65398f819a301718c42f5
Target: m21aouad/mini-projet-intro-ml @ 6c7a7bfe47320a9f267a4b0bade72cb8c498baf9
Commits on Source (2)

4a546b84 · CHOUMMIKH Meriam · authored 1 year ago
    delete unecessary comments and reorganise the order of the functions

203c1286 · CHOUMMIKH Meriam · authored 1 year ago
    delete unecessary comments and reorganize the order of functions
Showing 2 changed files with 449 additions and 725 deletions:
    binary_classification_workflow.py: 182 additions, 214 deletions
    main.ipynb: 267 additions, 511 deletions
binary_classification_workflow.py
View file @ 203c1286
...
@@ -21,46 +21,192 @@ from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
"""
kideney data
data description : 25 features ( 11 numeric ,14 nominal)
Numerical Data (11):
1. age: Age in years
2. bp: Blood Pressure in mm/Hg
3. bgr: Blood Glucose Random in mg/dL
4. bu: Blood Urea in mg/dL
5. sc: Serum Creatinine in mg/dL
6. sod: Sodium in mEq/L
7. pot: Potassium in mEq/L
8. hemo: Hemoglobin in gms
9. pcv: Packed Cell Volume (unit not specified)
10. wc: White Blood Cell Count in cells/cumm
11. rc: Red Blood Cell Count in millions/cmm
######### Main function for preprocessing ############
def pre_process_data(data):
    """
    Pre-process the data using the defined sub-functions.

    Parameters:
    -----------
    data (DataFrame): The dataset.

    Returns:
    ----------
    data: preprocessed data.
    """
    numerical_cols = get_numerical_columns(data)
    categorical_cols = get_categorical_columns(data)
    scale_normalize(data, numerical_cols)
    if categorical_cols:
        fill_categorical_kidney(data, categorical_cols)
        data = convert_categorical_feats(data, categorical_cols)
    fill_numerical_columns(data, skew_threshold=0.5)
    return data
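# Example usage (hypothetical sketch -- the CSV file name and the df_kidney
# variable are assumptions, not part of this module):
# df_kidney = pd.read_csv('kidney_disease.csv')
# df_kidney = pre_process_data(df_kidney)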
######### Main function for preparing the data for the training phase ############
def prepare_training_data(data, target, use_pca=True, threshold_variance_ratio=0.90):
    """
    Prepare the data for the training phase.

    Parameters:
    -----------
    data (DataFrame): The dataset.
    target (str): Name of the target column.
    use_pca (bool): Whether to project the features onto PCA components first.
    threshold_variance_ratio (float): Cumulative explained-variance ratio to retain.

    Returns:
    ----------
    X_train, X_test, y_train, y_test, cv: Train/test splits and the
    cross-validation strategy produced by split().
    """
    if use_pca:
        # Pass the parameter through instead of hardcoding 0.90
        df, explainable_ratios = feature_selection(
            data, target, threshold_variance_ratio=threshold_variance_ratio)

        # Explainability plots
        fig, axes = plt.subplots(1, 2, figsize=(10, 4))
        axes[0].scatter(df.loc[df['classification'] == 0, 'PCA1'],
                        df.loc[df['classification'] == 0, 'PCA2'],
                        color='red', label="CKD")
        axes[0].scatter(df.loc[df['classification'] == 1, 'PCA1'],
                        df.loc[df['classification'] == 1, 'PCA2'],
                        color='green', label="NOT CKD")
        axes[0].set_xlabel("PCA1: First component")
        axes[0].set_ylabel("PCA2: Second component")
        axes[0].set_title('Data projection onto the first two PCA components')
        axes[0].legend()
        axes[1].plot(range(1, len(explainable_ratios) + 1), explainable_ratios)
        axes[1].axhline(y=threshold_variance_ratio, linestyle='--', color='red',
                        label='Threshold Ratio')
        axes[1].set_xlabel('Number of components')
        axes[1].set_ylabel('Cumulative explained variance')
        axes[1].set_title("Cumulative explained variance vs number of components")
        plt.tight_layout()
        plt.subplots_adjust(wspace=0.4)
        plt.show()
    else:
        df = data  # Fall back to the original features when PCA is disabled

    X_train, X_test, y_train, y_test, cv = split(df, target, alpha=0.2, n=5)
    return X_train, X_test, y_train, y_test, cv
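# Example usage (hypothetical sketch -- 'classification' is the target column
# named in the dataset description above):
# X_train, X_test, y_train, y_test, cv = prepare_training_data(
#     df_kidney, target='classification', use_pca=True,
#     threshold_variance_ratio=0.90)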
######### Main function for training the data ############
def train_and_tune_model(X_train, y_train, model, param_grid, cv, scoring="f1",
                         verbose=1):
    """
    Train and fine-tune a binary classification model using GridSearchCV.

    Parameters
    ----------
    X_train : DataFrame
        Training data features.
    y_train : Series
        Training data target.
    model : Estimator object
        The binary classification model to be trained.
    param_grid : dict
        The hyperparameter grid to use for fine-tuning the model.
    cv : Cross-validation strategy
        The cross-validation splitting strategy.
    scoring : str or list of str, optional
        The scoring metric(s) to use for evaluation. Default is 'f1'.
    verbose : int, optional
        The verbosity level.

    Returns
    -------
    best_model : Estimator object
        The best model found during the GridSearchCV process.
    """
    # Setting up the GridSearchCV
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,        # Use parallel processing
        verbose=verbose,  # Amount of information printed during the search
    )
    # Fitting the model
    grid_search.fit(X_train, y_train)
    # Retrieving the best model
    best_model = grid_search.best_estimator_
    return best_model

# Example usage
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier()
# param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
# best_rf_model = train_and_tune_model(X_train, y_train, model, param_grid, cv)
######### Main function to display the results for comparison purposes ############
def display_results(dict_models, X_train, y_train, X_test, y_test, cv, disp_col):
    """
    Display the F1 scores of different models in a DataFrame for comparison purposes.

    Parameters
    ----------
    dict_models : dict
        Contains the models that we want to compare along with their parameter grids.
    X_train : DataFrame
        Training data features.
    y_train : Series
        Training data target.
    X_test : DataFrame
        Test data features.
    y_test : Series
        Test data target.
    cv : StratifiedKFold
        Cross-validation strategy.
    disp_col : str
        Name of the column to be displayed.

    Returns
    -------
    df_results : DataFrame
        DataFrame with the F1 scores.
    """
    df_results = pd.DataFrame(columns=["Model Name", disp_col])
    for model_name, model_details in tqdm(
            dict_models.items(),
            desc="Going through each model defined in the dictionary..."):
        # Extract the details related to every model from the dict
        model_params = model_details["param_grid"]
        model = model_details["model"]
        best_model = train_and_tune_model(X_train, y_train, model, model_params, cv)
        score = test_model(X_test, y_test, best_model)  # Evaluate F1 score on test data
        rounded_score = np.round(score * 100, 2)
        new_row = {"Model Name": model_name, disp_col: rounded_score}
        df_results = pd.concat([df_results, pd.DataFrame([new_row])],
                               ignore_index=True)

        conf_matrix = confusion_matrix(y_test, best_model.predict(X_test))
        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
        plt.title(f'Confusion Matrix - {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()

        # Print and analyze additional evaluation metrics
        y_pred = best_model.predict(X_test)
        print(f'Model: {model_name}')
        print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
        print(f'Precision: {precision_score(y_test, y_pred)}')
        print(f'Recall: {recall_score(y_test, y_pred)}')
        print(f'ROC-AUC: {roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])}')
        print('\n')

        # Plot learning curves
        train_sizes, train_scores, valid_scores = learning_curve(
            best_model, X_train, y_train, cv=cv, scoring='f1', n_jobs=-1)
        plt.figure(figsize=(8, 6))
        plt.plot(train_sizes, np.mean(train_scores, axis=1),
                 label='Training F1 Score')
        plt.plot(train_sizes, np.mean(valid_scores, axis=1),
                 label='Validation F1 Score')
        plt.xlabel('Training Examples')
        plt.ylabel('F1 Score')
        plt.legend()
        plt.title(f'Learning Curves - {model_name}')
        plt.show()

    # Apply styling after creating the DataFrame
    # Highlight the model with the highest F1 score
    df_results = df_results.style.highlight_max(subset=[disp_col], color='salmon')
    return df_results
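# Example usage (hypothetical sketch -- each entry must supply the "model" and
# "param_grid" keys that the loop above reads; the model choices here are
# illustrative assumptions):
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# dict_models = {
#     "Logistic Regression": {"model": LogisticRegression(max_iter=1000),
#                             "param_grid": {"C": [0.1, 1, 10]}},
#     "Random Forest": {"model": RandomForestClassifier(),
#                       "param_grid": {"n_estimators": [100, 200]}},
# }
# results = display_results(dict_models, X_train, y_train, X_test, y_test, cv,
#                           disp_col="F1 Score (%)")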
# Sub-function to identify categorical columns
def get_categorical_columns(data):
...
@@ -202,64 +348,6 @@ def has_outliers(df, col, z_score_threshold=3):
    z_scores = np.abs(stats.zscore(df[col].dropna()))
    return any(z_scores > z_score_threshold)
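# Example usage (hypothetical sketch -- get_numerical_columns is the helper
# used in pre_process_data above; df_kidney is an assumed DataFrame):
# outlier_cols = [col for col in get_numerical_columns(df_kidney)
#                 if has_outliers(df_kidney, col, z_score_threshold=3)]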
# def remove_outliers_iqr(df, col):
# """
# Remove outliers from a DataFrame based on the Interquartile Range (IQR) method.
# Parameters
# ----------
# df : Pandas.DataFrame
# The DataFrame containing the data of interest.
# col : str
# Column name from which to remove outliers.
# Returns
# -------
# DataFrame: Modified DataFrame with outliers removed.
# """
# Q1 = df[col].quantile(0.25)
# Q3 = df[col].quantile(0.75)
# IQR = Q3 - Q1
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR
# # Filtering out the outliers
# return df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
# skewed_columns_kidney = ['bp', 'age', 'bgr', 'bu', 'sc', 'pot', 'pcv', 'wc', 'rc']
# symmetric_columns_kidney = ['sod', 'hemo']
# def fill_numerical_kidney(df):
# """
# Parameters
# ----------
# df : Pandas.DataFrame
# must be the kidney data frame.
# Returns
# -------
# None. This function fills the numerical columns of the kidney df.
# """
# df.drop(df[df['wc'] == '\t?'].index, inplace=True)
# df.drop(df[df['pcv'] == '\t?'].index, inplace=True)
# df.drop(df[df['rc'] == '\t?'].index, inplace=True)
# for col in skewed_columns_kidney:
# fill_median(df, col)
# for col in symmetric_columns_kidney:
# fill_mean(df, col)
# fill_numerical_kidney(df_kidney)
# df_kidney.info()
# nan_count = df_kidney[df_kidney.isna().any(axis=1)].shape[0]
# print(f"Number of rows : {len(df_kidney)}")
# print(f"Number of rows with at least one NaN value: {nan_count}")
# print(f"{round(nan_count/len(df_kidney) * 100)}% of our rows have at least one"
# f" missing value")
def fill_numerical_columns(df, skew_threshold=0.5):
    """
...
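# Example usage (sketch -- mirrors the call made inside pre_process_data above):
# fill_numerical_columns(df_kidney, skew_threshold=0.5)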
@@ -464,78 +552,6 @@ def feature_selection(df, target, threshold_variance_ratio=0.99):
    return result_df, explained_variance_ratios
def test_model(X_test, y_test, model):
    """
    Evaluate the F1 score of a trained model on the test set.
...
@@ -560,54 +576,6 @@ def test_model(X_test, y_test, model):
    return score
main.ipynb
View file @ 203c1286
This diff is collapsed.