Merge branch 'main' of https://gitlab.imt-atlantique.fr/m21aouad/mini-projet-intro-ml

6c7a7bfe · SAIDI Mohamed · c974e931 · e77f9b15 · c974e931 · c974e931
Commit 6c7a7bfe authored 1 year ago by SAIDI Mohamed
--- a/.ipynb_checkpoints/draft_kidney-checkpoint.ipynb
+++ b/.ipynb_checkpoints/draft_kidney-checkpoint.ipynb
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
-{
-    "julia.environmentPath": "/Users/ilyaschahed/git/mini-projet-intro-ml"
-}
\ No newline at end of file
--- a/__pycache__/binary_classification_workflow.cpython-311.pyc
+++ b/__pycache__/binary_classification_workflow.cpython-311.pyc
--- a/binary_classification_workflow.py
+++ b/binary_classification_workflow.py
@@ -16,11 +16,9 @@ from sklearn.preprocessing import MinMaxScaler, StandardScaler, OrdinalEncoder,
 from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
 from sklearn.decomposition import PCA
 from sklearn.metrics import f1_score
-from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
 from sklearn.model_selection import learning_curve
 from sklearn.model_selection import GridSearchCV
-from binary_classification_workflow import *


 """
@@ -92,6 +90,9 @@ def get_numerical_columns(data):
    -----------
    list: List of names of numerical columns.
    """
+    if 'id' in data.columns:
+        data.drop(columns = ['id'], inplace=True) #Not a relevant column for the classification
+
    return data.iloc[:,:-1].select_dtypes(include=['int64', 'float64']).columns.tolist()


@@ -421,13 +422,14 @@ def convert_categorical_feats(df, categorical_cols):
            df[col] = ord_encoder.fit_transform(df[[col]]).astype(int)

        else:
-            one_hot_df = one_hot_encoder.fit_transform(df[[col]]) # Use one-hot encoding for categorical variables with more than 2 unique values
+            one_hot_df = one_hot_encoder.fit_transform(df[[col]])
            names_cols = one_hot_encoder.get_feature_names_out([col])
            encoded_df = pd.DataFrame(one_hot_df, columns=names_cols).astype(int)
            df = pd.concat([df, encoded_df], axis=1)
            df.drop(col, axis=1, inplace=True)

-    return df
+    return df 
+


 def feature_selection(df, target, threshold_variance_ratio=0.99):

--- a/draft_kidney.ipynb
+++ b/draft_kidney.ipynb
--- a/main.ipynb
+++ b/main.ipynb
--- a/unit_tests.py
+++ b/unit_tests.py
+import pandas as pd
+import unittest
+from binary_classification_workflow import *
+
+class UnitTests(unittest.TestCase):
+    """Unit tests for the helpers of ``binary_classification_workflow``.
+
+    Every test works on the first 100 rows of the kidney disease dataset,
+    reloaded before each test so that in-place changes made by one test
+    cannot leak into another.
+    """
+
+    def setUp(self):
+        """Load the test frame before each test method.
+
+        unittest only calls the fixture hook when it is named ``setUp``;
+        under any other name it is never invoked and ``self.df`` would be
+        undefined inside the tests.
+        """
+        # Load the first 100 samples of the kidney dataset for testing
+        self.df = pd.read_csv('./data/kidney_disease.csv').head(100)
+
+    # Keep the previous fixture name available for any explicit caller.
+    set_up_df = setUp
+
+    def test_get_categorical_columns(self):
+        """The detected categorical columns match the expected list."""
+        categorical_columns = get_categorical_columns(self.df)
+        expected_columns = ['rbc', 'pc', 'pcc', 'ba', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'classification']
+        self.assertEqual(categorical_columns, expected_columns)
+
+    def test_get_numerical_columns(self):
+        """The detected numerical columns match the expected list, without 'id'."""
+        numerical_columns = get_numerical_columns(self.df)
+        expected_columns = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']
+        # Ensure the 'id' column is no longer present in the frame
+        self.assertNotIn('id', self.df.columns)
+        self.assertEqual(numerical_columns, expected_columns)
+
+    def test_fill_median(self):
+        """After ``fill_median`` the 'bgr' column has no missing values."""
+        fill_median(self.df, 'bgr')
+        self.assertFalse(self.df['bgr'].isnull().any())
+
+    def test_fill_mean(self):
+        """After ``fill_mean`` the 'sod' column has no missing values."""
+        fill_mean(self.df, 'sod')
+        self.assertFalse(self.df['sod'].isnull().any())
+
+    def test_is_skewed(self):
+        """The 'hemo' column is expected to be reported as not skewed."""
+        skewed = is_skewed(self.df, 'hemo')
+        self.assertFalse(skewed)
+
+    def test_encode_categorical(self):
+        """Encoding the categorical features yields more columns than before."""
+        # Record the width first so the check does not depend on whether
+        # the helper modifies the input frame in place.
+        initial_column_count = len(self.df.columns)
+        categorical_cols = get_categorical_columns(self.df)
+        result_df = convert_categorical_feats(self.df, categorical_cols)
+        self.assertGreater(len(result_df.columns), initial_column_count)
+
+if __name__ == '__main__':  # only run the suite when executed as a script
+    unittest.main()