Commit 31b9e72b authored by CHOUMMIKH Meriam

implement function to display results + function to test models

parent 7eaa8cbb
@@ -15,6 +15,7 @@ from tqdm import tqdm
 from sklearn.preprocessing import MinMaxScaler, StandardScaler, OrdinalEncoder, OneHotEncoder
 from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
 from sklearn.decomposition import PCA
+from sklearn.metrics import f1_score
 """
@@ -85,7 +86,7 @@ def get_numerical_columns(data):
     -----------
     list: List of names of numerical columns.
     """
-    return data.select_dtypes(include=['int64', 'float64']).columns.tolist()
+    return data.iloc[:,:-1].select_dtypes(include=['int64', 'float64']).columns.tolist()
 def visualise_numerical_data(df,columns=None):
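The new `iloc[:,:-1]` slice assumes the target label sits in the last column, so it no longer shows up among the numerical features. A minimal sketch of the effect on toy data (hypothetical frame, not from the repository):

    import pandas as pd

    # hypothetical toy frame: numeric features plus an integer target in the last column
    data = pd.DataFrame({
        "age": [25, 32, 47],                     # int64 feature
        "income": [30000.0, 52000.0, 61000.0],   # float64 feature
        "city": ["Lyon", "Nice", "Pau"],         # object, ignored by select_dtypes
        "label": [0, 1, 1],                      # int64 target, now excluded
    })

    # before the fix: ['age', 'income', 'label']
    # after the fix:  ['age', 'income']
    print(data.iloc[:, :-1].select_dtypes(include=['int64', 'float64']).columns.tolist())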
@@ -410,12 +411,12 @@ def convert_categorical_feats(df, categorical_cols):
     for col in categorical_cols:
         if len(df[col].value_counts()) <= 2:
-            df[col] = ord_encoder.fit_transform(df[[col]])
+            df[col] = ord_encoder.fit_transform(df[[col]]).astype(int)
         else:
             one_hot_df = one_hot_encoder.fit_transform(df[[col]])  # Use one-hot encoding for categorical variables with more than 2 unique values
             names_cols = one_hot_encoder.get_feature_names_out([col])
-            encoded_df = pd.DataFrame(one_hot_df, columns=names_cols)
+            encoded_df = pd.DataFrame(one_hot_df, columns=names_cols).astype(int)
             df = pd.concat([df, encoded_df], axis=1)
             df.drop(col, axis=1, inplace=True)
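The two `.astype(int)` casts replace the encoders' float output (0.0/1.0) with integer codes and dummies. A small standalone sketch of the resulting frames; the encoder construction sits outside this hunk, so the setup below is an assumption (`sparse_output` requires scikit-learn >= 1.2):

    import pandas as pd
    from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

    df = pd.DataFrame({"sex": ["F", "M", "F"], "city": ["Lyon", "Nice", "Pau"]})

    # assumed setup: a dense one-hot encoder, as implied by pd.DataFrame(one_hot_df, ...)
    ord_encoder = OrdinalEncoder()
    one_hot_encoder = OneHotEncoder(sparse_output=False)

    # binary column -> a single integer code column
    df["sex"] = ord_encoder.fit_transform(df[["sex"]]).astype(int)

    # >2 categories -> integer dummy columns city_Lyon, city_Nice, city_Pau
    dummies = pd.DataFrame(
        one_hot_encoder.fit_transform(df[["city"]]),
        columns=one_hot_encoder.get_feature_names_out(["city"]),
    ).astype(int)
    df = pd.concat([df.drop(columns="city"), dummies], axis=1)
    print(df.dtypes)  # every column is int64 now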
@@ -451,10 +452,78 @@ def feature_selection(df, target, threshold_variance_ratio=0.99):
     result_df = pd.DataFrame(X[:, :nb_feats], columns=[f'PCA{i+1}' for i in range(nb_feats)])
     result_df[target] = df[target]
-    return result_df, pca.explained_variance_ratio_
+    return result_df, explained_variance_ratios
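The rename indicates the ratios are now stored in a local variable earlier in `feature_selection`; that part falls outside the hunk. For orientation, a hypothetical reconstruction of the variance-threshold idea the visible lines imply (`pca_reduce` is an illustrative name, not the committed function):

    import numpy as np
    import pandas as pd
    from sklearn.decomposition import PCA

    def pca_reduce(df, target, threshold_variance_ratio=0.99):
        # project the features, then keep the smallest number of components
        # whose cumulative explained variance reaches the threshold
        pca = PCA()
        X = pca.fit_transform(df.drop(columns=[target]))
        explained_variance_ratios = pca.explained_variance_ratio_
        cumulative = np.cumsum(explained_variance_ratios)
        nb_feats = int(np.searchsorted(cumulative, threshold_variance_ratio) + 1)
        result_df = pd.DataFrame(X[:, :nb_feats], columns=[f'PCA{i+1}' for i in range(nb_feats)])
        result_df[target] = df[target]
        return result_df, explained_variance_ratios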
-def train_and_tune_model(X_train, y_train, model, param_grid, cv, scoring = "f1_score", verbose=1):
+def display_results(dict_models, X_train, y_train, X_test, y_test, cv, disp_col):
"""
Display the F1 scores of different models in a DataFrame for comparison purposes.
Parameters
----------
dict_models : dict
Contains the models that we want to compare along with their parameter grids.
X_train : DataFrame
Training data features.
y_train : Series
Training data target
X_test : DataFrame
Test data features
y_test : Series
Test data target
cv : StratifiedKFold
Cross-validation strategy
disp_col : str
Name of the column to be displayed
Returns
-------
df_results : DataFrame
DataFrame with the F1 scores.
"""
df_results = pd.DataFrame(columns=["Model Name",disp_col])
for model_name, model_details in tqdm(dict_models.items(), desc="Going through each model defined in the dictionnary..."):
#extract the details related to every model from the dict
model_params = model_details["param_grid"]
model = model_details["model"]
best_model = train_and_tune_model(X_train, y_train, model, model_params, cv)
score = test_model(X_test, y_test, best_model) #evaluate f1 score on test data
rounded_score = np.round(score*100,2)
new_row = {"Model Name": model_name, disp_col: rounded_score}
df_results = pd.concat([df_results, pd.DataFrame([new_row])], ignore_index=True)
df_results = df_results.style.highlight_max(subset=[disp_col], color='salmon') #highlight the model with the higher f1 score
return df_results
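A hypothetical usage sketch. The "model"/"param_grid" keys mirror the loop above; the estimators, grids, and split are illustrative assumptions, not from the repository:

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedKFold, train_test_split

    # assumed: X, y are the preprocessed features and binary target
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

    dict_models = {
        "Logistic Regression": {
            "model": LogisticRegression(max_iter=1000),
            "param_grid": {"C": [0.1, 1, 10]},
        },
        "Random Forest": {
            "model": RandomForestClassifier(random_state=0),
            "param_grid": {"n_estimators": [100, 300], "max_depth": [None, 10]},
        },
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    results = display_results(dict_models, X_train, y_train, X_test, y_test, cv, "F1 score (%)")
    results  # in a notebook, renders the styled table with the best score highlighted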
+def test_model(X_test, y_test, model):
+    """
+    Evaluate the F1 score of a trained model on the test set.
+    Parameters
+    ----------
+    X_test : DataFrame
+        Test data features.
+    y_test : Series
+        Test data target.
+    model : trained model
+        The model to be evaluated.
+    Returns
+    -------
+    score : float
+        The F1 score of the model on the test set.
+    """
+    y_pred = model.predict(X_test)
+    score = f1_score(y_test, y_pred)
+    return score
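One caveat worth noting: `f1_score` is called with its default `average='binary'`, so `test_model` as written only suits binary targets; multiclass data would need an explicit `average` argument. Direct usage (assuming a fitted classifier `clf` and an existing test split):

    # hypothetical: clf is any fitted binary classifier exposing .predict
    score = test_model(X_test, y_test, clf)
    print(f"Test F1: {score:.3f}")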
+def train_and_tune_model(X_train, y_train, model, param_grid, cv, scoring = "f1", verbose=1):
     """
     Train and fine-tune a binary classification model using GridSearchCV.
...
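The body of `train_and_tune_model` is truncated in this view. Note that the default scorer was corrected from "f1_score" (not a valid scikit-learn scorer name) to "f1". Judging only from the signature and docstring, a minimal sketch of such a helper might look like this (an assumption, not the committed implementation):

    from sklearn.model_selection import GridSearchCV

    def train_and_tune_model_sketch(X_train, y_train, model, param_grid, cv, scoring="f1", verbose=1):
        # exhaustive search over param_grid, scoring each candidate by F1 across the CV folds
        grid = GridSearchCV(model, param_grid, scoring=scoring, cv=cv, verbose=verbose)
        grid.fit(X_train, y_train)
        # refit=True (the default) retrains the best candidate on all of X_train,
        # so display_results gets back a fitted estimator ready for test_model
        return grid.best_estimator_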