Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
Mini Projet Intro ML
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
AOUAD Mohamed, Jad
Mini Projet Intro ML
Commits
31b9e72b
Commit
31b9e72b
authored
1 year ago
by
CHOUMMIKH Meriam
Browse files
Options
Downloads
Patches
Plain Diff
implement function to display results + function to test models
parent
7eaa8cbb
No related branches found
No related tags found
No related merge requests found
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
binary_classification_workflow.py
+74
-5
74 additions, 5 deletions
binary_classification_workflow.py
draft_kidney.ipynb
+186
-83
186 additions, 83 deletions
draft_kidney.ipynb
with
260 additions
and
88 deletions
binary_classification_workflow.py
+
74
−
5
View file @
31b9e72b
...
...
@@ -15,6 +15,7 @@ from tqdm import tqdm
from
sklearn.preprocessing
import
MinMaxScaler
,
StandardScaler
,
OrdinalEncoder
,
OneHotEncoder
from
sklearn.model_selection
import
train_test_split
,
StratifiedKFold
,
GridSearchCV
from
sklearn.decomposition
import
PCA
from
sklearn.metrics
import
f1_score
"""
...
...
@@ -85,7 +86,7 @@ def get_numerical_columns(data):
-----------
list: List of names of numerical columns.
"""
return
data
.
select_dtypes
(
include
=
[
'
int64
'
,
'
float64
'
]).
columns
.
tolist
()
return
data
.
iloc
[:,:
-
1
].
select_dtypes
(
include
=
[
'
int64
'
,
'
float64
'
]).
columns
.
tolist
()
def
visualise_numerical_data
(
df
,
columns
=
None
):
...
...
@@ -410,12 +411,12 @@ def convert_categorical_feats(df, categorical_cols):
for
col
in
categorical_cols
:
if
len
(
df
[
col
].
value_counts
())
<=
2
:
df
[
col
]
=
ord_encoder
.
fit_transform
(
df
[[
col
]])
df
[
col
]
=
ord_encoder
.
fit_transform
(
df
[[
col
]])
.
astype
(
int
)
else
:
one_hot_df
=
one_hot_encoder
.
fit_transform
(
df
[[
col
]])
# Use one-hot encoding for categorical variables with more than 2 unique values
names_cols
=
one_hot_encoder
.
get_feature_names_out
([
col
])
encoded_df
=
pd
.
DataFrame
(
one_hot_df
,
columns
=
names_cols
)
encoded_df
=
pd
.
DataFrame
(
one_hot_df
,
columns
=
names_cols
)
.
astype
(
int
)
df
=
pd
.
concat
([
df
,
encoded_df
],
axis
=
1
)
df
.
drop
(
col
,
axis
=
1
,
inplace
=
True
)
...
...
@@ -451,10 +452,78 @@ def feature_selection(df, target, threshold_variance_ratio=0.99):
result_df
=
pd
.
DataFrame
(
X
[:,
:
nb_feats
],
columns
=
[
f
'
PCA
{
i
+
1
}
'
for
i
in
range
(
nb_feats
)])
result_df
[
target
]
=
df
[
target
]
return
result_df
,
pca
.
explained_variance_ratio
_
return
result_df
,
explained_variance_ratio
s
def
train_and_tune_model
(
X_train
,
y_train
,
model
,
param_grid
,
cv
,
scoring
=
"
f1_score
"
,
verbose
=
1
):
def display_results(dict_models, X_train, y_train, X_test, y_test, cv, disp_col):
    """
    Tune, evaluate and tabulate several models so their scores can be compared.

    Each model is tuned with ``train_and_tune_model`` on the training data,
    then scored with ``test_model`` on the test data. The score is stored
    as a percentage rounded to two decimals.

    Parameters
    ----------
    dict_models : dict
        Maps a model name to a dict holding the estimator under the key
        "model" and its parameter grid under the key "param_grid".
    X_train : DataFrame
        Training data features.
    y_train : Series
        Training data target.
    X_test : DataFrame
        Test data features.
    y_test : Series
        Test data target.
    cv : StratifiedKFold
        Cross-validation strategy forwarded to ``train_and_tune_model``.
    disp_col : str
        Name of the column that holds the scores.

    Returns
    -------
    df_results : pandas Styler
        Styled table with the columns "Model Name" and ``disp_col``, one row
        per model, in which the highest score is highlighted. Note that this
        is a Styler wrapping the results DataFrame, not a plain DataFrame.
    """
    # Rows are accumulated in a list and the frame is built once at the end:
    # calling pd.concat inside the loop copies the whole frame at every
    # iteration and starts by concatenating an empty frame.
    rows = []

    for model_name, model_details in tqdm(
        dict_models.items(),
        desc="Going through each model defined in the dictionnary...",
    ):
        # Extract the details related to every model from the dict.
        model_params = model_details["param_grid"]
        model = model_details["model"]

        best_model = train_and_tune_model(X_train, y_train, model, model_params, cv)

        # Evaluate the tuned model on the held-out test data.
        score = test_model(X_test, y_test, best_model)
        rounded_score = np.round(score * 100, 2)

        rows.append({"Model Name": model_name, disp_col: rounded_score})

    # Passing `columns` keeps both headers even when dict_models is empty.
    df_results = pd.DataFrame(rows, columns=["Model Name", disp_col])

    # Highlight the model with the highest score.
    return df_results.style.highlight_max(subset=[disp_col], color='salmon')
def test_model(X_test, y_test, model):
    """
    Compute the F1 score of an already fitted model on held-out data.

    Parameters
    ----------
    X_test : DataFrame
        Test data features, passed to ``model.predict``.
    y_test : Series
        Test data target, used as the ground truth.
    model : trained model
        Fitted estimator exposing a ``predict`` method.

    Returns
    -------
    score : float
        Value returned by ``f1_score`` for the model's predictions.
    """
    # Predict on the test features and compare against the true labels.
    return f1_score(y_test, model.predict(X_test))
def
train_and_tune_model
(
X_train
,
y_train
,
model
,
param_grid
,
cv
,
scoring
=
"
f1
"
,
verbose
=
1
):
"""
Train and fine-tune a binary classification model using GridSearchCV.
...
...
This diff is collapsed.
Click to expand it.
draft_kidney.ipynb
+
186
−
83
View file @
31b9e72b
Source diff could not be displayed: it is too large. Options to address this:
view the blob
.
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment