Skip to content
Snippets Groups Projects
Commit 6c7a7bfe authored by SAIDI Mohamed's avatar SAIDI Mohamed
Browse files
parents c974e931 e77f9b15
No related branches found
No related tags found
No related merge requests found
Source diff could not be displayed: it is too large. Options to address this: view the blob.
{
"julia.environmentPath": "/Users/ilyaschahed/git/mini-projet-intro-ml"
}
\ No newline at end of file
File deleted
......@@ -16,11 +16,9 @@ from sklearn.preprocessing import MinMaxScaler, StandardScaler, OrdinalEncoder,
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
from binary_classification_workflow import *
"""
......@@ -92,6 +90,9 @@ def get_numerical_columns(data):
-----------
list: List of names of numerical columns.
"""
if 'id' in data.columns:
data.drop(columns = ['id'], inplace=True) #Not a relevant column for the classification
return data.iloc[:,:-1].select_dtypes(include=['int64', 'float64']).columns.tolist()
......@@ -421,13 +422,14 @@ def convert_categorical_feats(df, categorical_cols):
df[col] = ord_encoder.fit_transform(df[[col]]).astype(int)
else:
one_hot_df = one_hot_encoder.fit_transform(df[[col]]) # Use one-hot encoding for categorical variables with more than 2 unique values
one_hot_df = one_hot_encoder.fit_transform(df[[col]])
names_cols = one_hot_encoder.get_feature_names_out([col])
encoded_df = pd.DataFrame(one_hot_df, columns=names_cols).astype(int)
df = pd.concat([df, encoded_df], axis=1)
df.drop(col, axis=1, inplace=True)
return df
return df
def feature_selection(df, target, threshold_variance_ratio=0.99):
......
This diff is collapsed.
This diff is collapsed.
import pandas as pd
import unittest
from binary_classification_workflow import *
class UnitTests(unittest.TestCase):
    """Unit tests for the helpers star-imported from binary_classification_workflow.

    Every test works on ``self.df``, which is reloaded before each test so
    that helpers which modify the dataframe cannot leak state between tests.
    """

    def setUp(self):
        """Build the fixture before each test.

        unittest only calls the hook named ``setUp``; a method called
        ``set_up_df`` is never invoked automatically, which would leave
        ``self.df`` undefined in every test.
        """
        self.set_up_df()

    def set_up_df(self):
        """Load the first 100 samples of the kidney dataset into ``self.df``."""
        self.df = pd.read_csv('./data/kidney_disease.csv').head(100)

    def test_get_categorical_columns(self):
        """The detected categorical columns match the expected list."""
        categorical_columns = get_categorical_columns(self.df)
        expected_columns = [
            'rbc', 'pc', 'pcc', 'ba', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
            'appet', 'pe', 'ane', 'classification',
        ]
        self.assertEqual(categorical_columns, expected_columns)

    def test_get_numerical_columns(self):
        """The detected numerical columns match the expected list, without 'id'."""
        numerical_columns = get_numerical_columns(self.df)
        expected_columns = [
            'age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot',
            'hemo',
        ]
        # Ensure 'id' column is not present in the dataframe after the call
        self.assertNotIn('id', self.df.columns)
        self.assertEqual(numerical_columns, expected_columns)

    def test_fill_median(self):
        """After fill_median, the 'bgr' column of self.df has no missing values."""
        fill_median(self.df, 'bgr')
        self.assertFalse(self.df['bgr'].isnull().any())

    def test_fill_mean(self):
        """After fill_mean, the 'sod' column of self.df has no missing values."""
        fill_mean(self.df, 'sod')
        self.assertFalse(self.df['sod'].isnull().any())

    def test_is_skewed(self):
        """The 'hemo' column is reported as not skewed."""
        skewed = is_skewed(self.df, 'hemo')
        self.assertFalse(skewed)  # 'hemo' column is not skewed

    def test_encode_categorical(self):
        """Encoding the categorical features increases the number of columns."""
        categorical_cols = get_categorical_columns(self.df)
        # Record the width before encoding so the comparison does not depend
        # on whether the encoder modifies the input dataframe.
        initial_column_count = len(self.df.columns)
        result_df = convert_categorical_feats(self.df, categorical_cols)
        self.assertGreater(len(result_df.columns), initial_column_count)
# Run the test suite only when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment