Skip to content
Snippets Groups Projects
Commit 31b9e72b authored by CHOUMMIKH Meriam
Browse files

implement function to display results + function to test models

parent 7eaa8cbb
No related branches found
No related tags found
No related merge requests found
......@@ -15,6 +15,7 @@ from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
"""
......@@ -85,7 +86,7 @@ def get_numerical_columns(data):
-----------
list: List of names of numerical columns.
"""
return data.select_dtypes(include=['int64', 'float64']).columns.tolist()
return data.iloc[:,:-1].select_dtypes(include=['int64', 'float64']).columns.tolist()
def visualise_numerical_data(df,columns=None):
......@@ -410,12 +411,12 @@ def convert_categorical_feats(df, categorical_cols):
for col in categorical_cols:
if len(df[col].value_counts()) <= 2:
df[col] = ord_encoder.fit_transform(df[[col]])
df[col] = ord_encoder.fit_transform(df[[col]]).astype(int)
else:
one_hot_df = one_hot_encoder.fit_transform(df[[col]]) # Use one-hot encoding for categorical variables with more than 2 unique values
names_cols = one_hot_encoder.get_feature_names_out([col])
encoded_df = pd.DataFrame(one_hot_df, columns=names_cols)
encoded_df = pd.DataFrame(one_hot_df, columns=names_cols).astype(int)
df = pd.concat([df, encoded_df], axis=1)
df.drop(col, axis=1, inplace=True)
......@@ -451,10 +452,78 @@ def feature_selection(df, target, threshold_variance_ratio=0.99):
result_df = pd.DataFrame(X[:, :nb_feats], columns=[f'PCA{i+1}' for i in range(nb_feats)])
result_df[target] = df[target]
return result_df, pca.explained_variance_ratio_
return result_df, explained_variance_ratios
def train_and_tune_model(X_train, y_train, model, param_grid, cv, scoring = "f1_score", verbose=1):
def display_results(dict_models, X_train, y_train, X_test, y_test, cv, disp_col):
    """
    Tune every model in ``dict_models``, score it on the test set and collect
    the scores in a single styled table for comparison purposes.

    Parameters
    ----------
    dict_models : dict
        Maps a model name to a dict holding the estimator under the key
        ``"model"`` and its parameter grid under the key ``"param_grid"``.
    X_train : DataFrame
        Training data features.
    y_train : Series
        Training data target.
    X_test : DataFrame
        Test data features.
    y_test : Series
        Test data target.
    cv : StratifiedKFold
        Cross-validation strategy forwarded to ``train_and_tune_model``.
    disp_col : str
        Name of the column that holds the scores in the output table.

    Returns
    -------
    df_results : pandas Styler
        Styled table with one row per model ("Model Name", ``disp_col``).
        Scores are expressed as percentages rounded to two decimals and the
        highest score is highlighted. The underlying DataFrame is available
        through the ``.data`` attribute of the returned Styler.
    """
    # Collect the rows in a plain list and build the DataFrame once at the end:
    # concatenating onto an (initially empty) DataFrame inside the loop copies
    # the whole frame at every iteration and relies on pandas' deprecated
    # handling of empty frames when inferring the column dtypes.
    rows = []
    for model_name, model_details in tqdm(dict_models.items(), desc="Going through each model defined in the dictionnary..."):
        # Extract the details related to the current model from the dict.
        model_params = model_details["param_grid"]
        model = model_details["model"]
        best_model = train_and_tune_model(X_train, y_train, model, model_params, cv)
        # Evaluate the tuned model on the held-out test data.
        score = test_model(X_test, y_test, best_model)
        rows.append({"Model Name": model_name, disp_col: np.round(score * 100, 2)})

    # Passing ``columns`` keeps the expected layout even when no model was given.
    df_results = pd.DataFrame(rows, columns=["Model Name", disp_col])
    # Highlight the model with the highest score.
    return df_results.style.highlight_max(subset=[disp_col], color='salmon')
def test_model(X_test, y_test, model):
    """
    Compute the F1 score obtained by an already fitted model on the test set.

    Parameters
    ----------
    X_test : DataFrame
        Features of the test data, passed to ``model.predict``.
    y_test : Series
        True target values of the test data.
    model : trained model
        Fitted estimator exposing a ``predict`` method.

    Returns
    -------
    score : float
        F1 score of the model's predictions against ``y_test``.
    """
    predictions = model.predict(X_test)
    return f1_score(y_test, predictions)
def train_and_tune_model(X_train, y_train, model, param_grid, cv, scoring = "f1", verbose=1):
"""
Train and fine-tune a binary classification model using GridSearchCV.
......
Source diff could not be displayed: it is too large. Options to address this: view the blob.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment