Commit 76186a88 authored by Konstantin Gerd Eyhorn

balancing train data and more evaluation metrics

parent deaa32b4
@@ -4,32 +4,66 @@ import numpy as np
 import torch
 from tqdm import trange, tqdm
 from sklearn.model_selection import train_test_split
+import time
 import matplotlib.pyplot as plt
 from sklearn.metrics import confusion_matrix
 import seaborn as sns
+from sklearn.metrics import roc_auc_score
+# S_features_filter = [abs_S_Smin,rel_S_Smin_semi_width,rel_S_Smin_full_width,abs_S_Smax,rel_S_Smax_semi_width,rel_S_Smax_full_width,count_anomalies_S,ratio_anomalies_S,max_variation_S]
+# T_features_filter = [abs_T_Tmin,rel_T_Tmin_semi_width,rel_T_Tmin_full_width,abs_T_Tmax,rel_T_Tmax_semi_width,rel_T_Tmax_full_width,count_anomalies_T,ratio_anomalies_T,max_variation_T]
+# B_features_filter = [mean_correlation,nb_measurements]
+
+S_features_filter = [
+    "abs_S_Smin",
+    "rel_S_Smin_semi_width",
+    "rel_S_Smin_full_width",
+    "abs_S_Smax",
+    "rel_S_Smax_semi_width",
+    "rel_S_Smax_full_width",
+    "count_anomalies_S",
+    "ratio_anomalies_S",
+    "max_variation_S",
+]
+T_features_filter = [
+    "abs_T_Tmin",
+    "rel_T_Tmin_semi_width",
+    "rel_T_Tmin_full_width",
+    "abs_T_Tmax",
+    "rel_T_Tmax_semi_width",
+    "rel_T_Tmax_full_width",
+    "count_anomalies_T",
+    "ratio_anomalies_T",
+    "max_variation_T",
+]
+B_features_filter = ["mean_correlation", "nb_measurements"]
 PICKLE_PATH = "dataset_pandas/temperature.pkl"
 
 ##### HYPERPARAMETERS #####
 EPOCHS = 300
-BATCH_SIZE = 16
+BATCH_SIZE = 32
 CRITERION = nn.BCELoss()
 OPTIMIZER = torch.optim.Adam
 LEARNING_RATE = 0.01
 GROWTH_RATE = 16
 DROP_RATE = 0.5
-SCHEDULER_PATIENCE = 20
+SCHEDULER_PATIENCE = 10
 SCHEDULER_FACTOR = 0.5
 SCHEDULER_EPS = 1e-8
+
+input_features = 11
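
Note: input_features = 11 is consistent with the data change below: the original 20 features minus the 9 dropped salinity features. The SCHEDULER_* constants are not wired up anywhere in the visible hunks; presumably they feed a torch.optim.lr_scheduler.ReduceLROnPlateau. A minimal sketch of that wiring, under that assumption (mode="min" and the monitored quantity are guesses, not shown in this commit):

# Sketch only -- the optimizer/scheduler construction is outside the visible hunks.
model = MLP()
optimizer = OPTIMIZER(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",                   # assumes validation loss is the monitored metric
    factor=SCHEDULER_FACTOR,      # halve the learning rate on each plateau
    patience=SCHEDULER_PATIENCE,  # now 10 epochs without improvement, down from 20
    eps=SCHEDULER_EPS,            # ignore lr updates smaller than this threshold
)
# During training, typically scheduler.step(val_loss) once per epoch.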
 class MLP(nn.Module):
     def __init__(self):
         super(MLP, self).__init__()
         self.block1 = nn.Sequential(
-            nn.Linear(20, GROWTH_RATE), nn.BatchNorm1d(GROWTH_RATE), nn.ReLU()
+            nn.Linear(input_features, GROWTH_RATE),
+            nn.BatchNorm1d(GROWTH_RATE),
+            nn.ReLU(),
         )
         self.block2 = nn.Sequential(
             nn.Linear(GROWTH_RATE, GROWTH_RATE), nn.BatchNorm1d(GROWTH_RATE), nn.ReLU()
@@ -68,25 +102,42 @@ class MLP(nn.Module):
         return y
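
The diff truncates the rest of the class body. From the visible pieces (two hidden blocks, GROWTH_RATE, DROP_RATE, and BCELoss expecting probabilities), the full model plausibly ends in a sigmoid head; the sketch below is an assumption about the elided code, not the commit's actual architecture:

from torch import nn

# Hypothetical reconstruction; depth and dropout placement are guesses.
class MLPSketch(nn.Module):
    def __init__(self):
        super().__init__()
        self.block1 = nn.Sequential(
            nn.Linear(input_features, GROWTH_RATE),
            nn.BatchNorm1d(GROWTH_RATE),
            nn.ReLU(),
        )
        self.block2 = nn.Sequential(
            nn.Linear(GROWTH_RATE, GROWTH_RATE), nn.BatchNorm1d(GROWTH_RATE), nn.ReLU()
        )
        self.dropout = nn.Dropout(DROP_RATE)
        # BCELoss takes probabilities, so the output layer ends in a sigmoid
        self.head = nn.Sequential(nn.Linear(GROWTH_RATE, 1), nn.Sigmoid())

    def forward(self, x):
        x = self.block1(x)
        x = self.dropout(self.block2(x))
        return self.head(x).squeeze(1)  # shape (batch,) to match BCELoss targets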
-def prepare_data() -> tuple[np.ndarray, np.ndarray]:
+def prepare_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
     # Load data
     df = pd.read_pickle(PICKLE_PATH)
-    X = df.drop(columns=["alarm"]).to_numpy()
-    y = df["alarm"].to_numpy()
-    assert X.shape[1] == 20, "Number of features should be 20"
-    assert y.shape[0] == X.shape[0], "Number of labels should match number of samples"
-    return X, y
+    # drop Salinity Features
+    df = df.drop(columns=S_features_filter)
+
+    # split into training and testing sets
+    train_df, test_df = train_test_split(df, test_size=0.2, random_state=123456789)
+    print(
+        f"Train alarm distribution (before undersampling): {train_df['alarm'].value_counts()}"
+    )
+
+    # balance the training set by undersampling the majority class
+    min_alarm = train_df["alarm"].value_counts().min()
+    train_df = pd.concat(
+        [
+            train_df[train_df["alarm"] == 0].sample(min_alarm),
+            train_df[train_df["alarm"] == 1].sample(min_alarm),
+        ]
+    )
+    print(f"Train alarm distribution (after undersampling): {train_df['alarm'].value_counts()}")
+
+    X_train = train_df.drop(columns=["alarm"]).values
+    y_train = train_df["alarm"].values
+    X_test = test_df.drop(columns=["alarm"]).values
+    y_test = test_df["alarm"].values
+
+    return X_train, y_train, X_test, y_test
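
Note: the train/test split is seeded (random_state=123456789), but the two .sample(min_alarm) calls above are not, so the undersampled training set still varies between runs. A seeded variant of the same balancing step (the seed value 0 is arbitrary, not from this commit):

min_alarm = train_df["alarm"].value_counts().min()
train_df = pd.concat(
    [
        train_df[train_df["alarm"] == 0].sample(min_alarm, random_state=0),
        train_df[train_df["alarm"] == 1].sample(min_alarm, random_state=0),
    ]
)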
 
-def train_model(X: np.ndarray, y: np.ndarray):
+def train_model(X_train, y_train, X_test, y_test):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    # Splitting the data into training and testing sets
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
 
     # Setting up the data loader
     train_loader = torch.utils.data.DataLoader(
@@ -191,22 +242,45 @@ def train_model(X: np.ndarray, y: np.ndarray):
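
The DataLoader construction is cut off here. Building one from the numpy arrays that prepare_data now returns typically looks like the sketch below; the float32 dtypes and shuffle=True are assumptions, since this diff truncates the code:

# Sketch: wrapping the arrays for the training loop (assumed, not shown in the diff).
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32),
)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True
)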
     # calculate confusion matrix
     cm = confusion_matrix(y_test, y_test_pred_binary)
+    print(cm)
+
+    # calculate accuracy
+    accuracy = np.sum(np.diag(cm)) / np.sum(cm)
+    print(f"Accuracy: {accuracy}")
+
+    # calculate recall
+    recall = cm[1, 1] / (cm[1, 0] + cm[1, 1])
+    print(f"Recall: {recall}")
+
+    # calculate precision
+    precision = cm[1, 1] / (cm[0, 1] + cm[1, 1])
+    print(f"Precision: {precision}")
+
+    # calculate F1 score
+    f1 = 2 * (precision * recall) / (precision + recall)
+    print(f"F1 score: {f1}")
+
+    # calculate F2 score
+    f2 = 5 * (precision * recall) / (4 * precision + recall)
+    print(f"F2 score: {f2}")
+
+    # calculate AUC from the continuous scores
+    auc = roc_auc_score(y_test, y_test_pred)
+    print(f"AUC: {auc}")
 
     # plot confusion matrix using seaborn
     sns.heatmap(cm, annot=True, fmt="d")
     plt.xlabel("Predicted")
     plt.ylabel("True")
     plt.show()
 
-    # calculate accuracy
-    accuracy = np.sum(np.diag(cm)) / np.sum(cm)
-    print(f"Accuracy: {accuracy}")
-
     return model
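
How y_test_pred and y_test_pred_binary are produced is also outside the visible hunks. With a sigmoid output and BCELoss, the usual pattern is to read the model output as P(alarm) and threshold at 0.5; the sketch below assumes exactly that (the threshold and tensor handling are guesses):

# Sketch: continuous scores for roc_auc_score, binary labels for the confusion matrix.
model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_test_pred = model(X_test_tensor).cpu().numpy()   # probabilities in [0, 1]
y_test_pred_binary = (y_test_pred > 0.5).astype(int)   # assumed 0.5 threshold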
 
 def main():
-    X, y = prepare_data()
-    model = train_model(X, y)
+    X_train, y_train, X_test, y_test = prepare_data()
+    model = train_model(X_train, y_train, X_test, y_test)
 
     # print parameter count of model
     print(f"Parameter count: {sum(p.numel() for p in model.parameters())}")
...