Commit d5c7ae90 authored by Konstantin Gerd Eyhorn

Merge branch 'mlp_konstantin'

parents af784fdf 76186a88
# PyTorch checkpoints
*.pth
\ No newline at end of file
@@ -24,6 +24,7 @@ dependencies:
- cf_xarray=0.9.0=pyhd8ed1ab_0
- cftime=1.6.3=py311h1f0f07a_0
- charset-normalizer=3.3.2=pyhd8ed1ab_0
- colorama=0.4.6=pyhd8ed1ab_0
- comm=0.2.2=pyhd8ed1ab_0
- contourpy=1.2.0=py311h9547e67_0
- cuda-cudart=11.8.89=0
@@ -71,13 +72,18 @@ dependencies:
- importlib-metadata=7.1.0=pyha770c72_0
- importlib_metadata=7.1.0=hd8ed1ab_0
- ipykernel=6.29.3=pyhd33586a_0
- ipympl=0.9.3=pyhd8ed1ab_0
- ipython=8.22.2=pyh707e725_0
- ipython_genutils=0.2.0=py_1
- ipywidgets=8.1.2=pyhd8ed1ab_0
- jack=1.9.22=h11f4161_0
- jedi=0.19.1=pyhd8ed1ab_0
- jinja2=3.1.3=pyhd8ed1ab_0
- joblib=1.4.0=pyhd8ed1ab_0
- jpeg=9e=h166bdaf_2
- jupyter_client=8.6.1=pyhd8ed1ab_0
- jupyter_core=5.7.2=py311h38be061_0
- jupyterlab_widgets=3.0.10=pyhd8ed1ab_0
- keyutils=1.6.1=h166bdaf_0
- kiwisolver=1.4.5=py311h9547e67_1
- krb5=1.20.1=h81ceb04_0
@@ -180,6 +186,7 @@ dependencies:
- packaging=24.0=pyhd8ed1ab_0
- pandas=2.2.1=py311h320fe9a_0
- parso=0.8.3=pyhd8ed1ab_0
- patsy=0.5.6=pyhd8ed1ab_0
- pcre2=10.43=hcad00b1_0
- pexpect=4.9.0=pyhd8ed1ab_0
- pickleshare=0.7.5=py_1003
@@ -217,15 +224,20 @@ dependencies:
- qt-main=5.15.8=h5d23da1_6
- readline=8.2=h8228510_1
- requests=2.31.0=pyhd8ed1ab_0
- scikit-learn=1.4.2=py311hc009520_0
- scipy=1.12.0=py311h64a7726_2
- seaborn=0.13.2=hd8ed1ab_0
- seaborn-base=0.13.2=pyhd8ed1ab_0
- setuptools=69.2.0=pyhd8ed1ab_0
- shapely=2.0.3=py311h2032efe_0
- sip=6.7.12=py311hb755f60_0
- six=1.16.0=pyh6c4a22f_0
- sqlite=3.45.2=h2c6b66d_0
- stack_data=0.6.2=pyhd8ed1ab_0
- statsmodels=0.14.1=py311h1f0f07a_0
- sympy=1.12=pypyh9d50eac_103
- tbb=2021.9.0=hf52228f_0
- threadpoolctl=3.4.0=pyhc1e730c_0
- tk=8.6.13=noxft_h4845f30_101
- toml=0.10.2=pyhd8ed1ab_0
- tomli=2.0.1=pyhd8ed1ab_0
@@ -233,12 +245,14 @@ dependencies:
- torchtriton=2.2.0=py311
- torchvision=0.17.2=py311_cu118
- tornado=6.4=py311h459d7ec_0
- tqdm=4.66.2=pyhd8ed1ab_0
- traitlets=5.14.2=pyhd8ed1ab_0
- typing_extensions=4.10.0=pyha770c72_0
- tzdata=2024a=h0c530f3_0
- urllib3=2.2.1=pyhd8ed1ab_0
- wcwidth=0.2.13=pyhd8ed1ab_0
- wheel=0.43.0=pyhd8ed1ab_1
- widgetsnbextension=4.0.10=pyhd8ed1ab_0
- xarray=2024.3.0=pyhd8ed1ab_0
- xcb-util=0.4.0=h516909a_0
- xcb-util-image=0.4.0=h166bdaf_0
...
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# Feature-name groups: S = salinity, T = temperature, B = correlation /
# measurement-count metadata. The S group is dropped in prepare_data().
S_features_filter = [
"abs_S_Smin",
"rel_S_Smin_semi_width",
"rel_S_Smin_full_width",
"abs_S_Smax",
"rel_S_Smax_semi_width",
"rel_S_Smax_full_width",
"count_anomalies_S",
"ratio_anomalies_S",
"max_variation_S",
]
T_features_filter = [
"abs_T_Tmin",
"rel_T_Tmin_semi_width",
"rel_T_Tmin_full_width",
"abs_T_Tmax",
"rel_T_Tmax_semi_width",
"rel_T_Tmax_full_width",
"count_anomalies_T",
"ratio_anomalies_T",
"max_variation_T",
]
B_features_filter = ["mean_correlation", "nb_measurements"]
PICKLE_PATH = "dataset_pandas/temperature.pkl"
##### HYPERPARAMETERS #####
EPOCHS = 300
BATCH_SIZE = 32
CRITERION = nn.BCELoss()
OPTIMIZER = torch.optim.Adam
LEARNING_RATE = 0.01
GROWTH_RATE = 16
DROP_RATE = 0.5
SCHEDULER_PATIENCE = 10
SCHEDULER_FACTOR = 0.5
SCHEDULER_EPS = 1e-8
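# With these settings, ReduceLROnPlateau halves the learning rate after 10
# epochs without improvement in the monitored loss; lr updates smaller than
# SCHEDULER_EPS are ignored.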
input_features = 11  # 9 T features + 2 B features remain after dropping the S group


class MLP(nn.Module):
def __init__(self):
super(MLP, self).__init__()
self.block1 = nn.Sequential(
nn.Linear(input_features, GROWTH_RATE),
nn.BatchNorm1d(GROWTH_RATE),
nn.ReLU(),
)
self.block2 = nn.Sequential(
nn.Linear(GROWTH_RATE, GROWTH_RATE), nn.BatchNorm1d(GROWTH_RATE), nn.ReLU()
)
self.block3 = nn.Sequential(
nn.Linear(2 * GROWTH_RATE, GROWTH_RATE),
nn.BatchNorm1d(GROWTH_RATE),
nn.ReLU(),
)
self.block4 = nn.Sequential(
nn.Linear(3 * GROWTH_RATE, GROWTH_RATE),
nn.BatchNorm1d(GROWTH_RATE),
nn.ReLU(),
)
self.block5 = nn.Sequential(
nn.Linear(4 * GROWTH_RATE, GROWTH_RATE),
nn.BatchNorm1d(GROWTH_RATE),
nn.ReLU(),
)
self.block6 = nn.Sequential(
nn.Linear(5 * GROWTH_RATE, GROWTH_RATE),
nn.BatchNorm1d(GROWTH_RATE),
nn.ReLU(),
)
self.output = nn.Linear(GROWTH_RATE, 1)
def forward(self, x):
x1 = self.block1(x)
x2 = self.block2(x1)
x3 = self.block3(torch.cat([x1, x2], dim=1))
x4 = self.block4(torch.cat([x1, x2, x3], dim=1))
x5 = self.block5(torch.cat([x1, x2, x3, x4], dim=1))
x6 = self.block6(torch.cat([x1, x2, x3, x4, x5], dim=1))
y = self.output(x6)
y = torch.sigmoid(y)
return y
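

# Minimal shape check for the block wiring above (an illustrative sketch, not
# called by the training script): every block receives the concatenation of
# all previous block outputs, DenseNet-style, and the head emits one sigmoid
# probability per sample.
def _shape_check():
    model = MLP().eval()  # eval mode so BatchNorm1d accepts any batch size
    with torch.no_grad():
        probs = model(torch.randn(4, input_features))
    assert probs.shape == (4, 1)
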
def prepare_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
# Load data
df = pd.read_pickle(PICKLE_PATH)
# drop Salinity Features
df = df.drop(columns=S_features_filter)
    # split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123456789)
    print(
        f"Train alarm distribution (before undersampling): {train_df['alarm'].value_counts()}"
    )
# balance the training set
min_alarm = train_df["alarm"].value_counts().min()
train_df = pd.concat(
[
train_df[train_df["alarm"] == 0].sample(min_alarm),
train_df[train_df["alarm"] == 1].sample(min_alarm),
]
)
print(f"Train alarm distribution: {train_df['alarm'].value_counts()}")
X_train = train_df.drop(columns=["alarm"]).values
y_train = train_df["alarm"].values
X_test = test_df.drop(columns=["alarm"]).values
y_test = test_df["alarm"].values
return X_train, y_train, X_test, y_test
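

# Toy illustration of the undersampling step above (hypothetical data, not
# called by the training script): three negatives and one positive are
# reduced to one sample per class.
def _undersampling_example():
    toy = pd.DataFrame({"alarm": [0, 0, 0, 1], "x": [1.0, 2.0, 3.0, 4.0]})
    n = toy["alarm"].value_counts().min()
    balanced = pd.concat(
        [toy[toy["alarm"] == 0].sample(n), toy[toy["alarm"] == 1].sample(n)]
    )
    assert (balanced["alarm"].value_counts() == 1).all()
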
def train_model(X_train, y_train, X_test, y_test):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Setting up the data loader
train_loader = torch.utils.data.DataLoader(
list(zip(X_train, y_train)), batch_size=BATCH_SIZE, shuffle=True
)
# Define model
model = MLP().to(device)
    # NOTE: this dropout module is attached but never called in forward(),
    # so DROP_RATE currently has no effect on the model
    model.dropout = nn.Dropout(DROP_RATE)
# Define loss function
criterion = CRITERION
# Define optimizer
optimizer = OPTIMIZER(model.parameters(), lr=LEARNING_RATE)
# Define a Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
optimizer,
mode="min",
factor=SCHEDULER_FACTOR,
patience=SCHEDULER_PATIENCE,
eps=SCHEDULER_EPS,
)
    # Train model
    os.makedirs("checkpoints", exist_ok=True)  # torch.save below needs the directory
    train_losses = []
    test_losses = []
for epoch in range(EPOCHS):
epoch_train_loss = 0
with tqdm(train_loader, unit="batch") as t:
for data, target in t:
t.set_description(f"Epoch {str(epoch).rjust(5)}")
# Move data to device
data, target = data.to(device), target.to(device)
# Zero the gradients
optimizer.zero_grad()
output = model(data.float())
# Calculate loss
loss = criterion(output, target.float().view(-1, 1))
epoch_train_loss += loss.item()
# Backpropagation
loss.backward()
# Update weights
optimizer.step()
# Display loss
t.set_postfix(train_loss=f"{loss.item():.4f}")
        # average the accumulated batch losses, then step the scheduler on the
        # epoch-level loss (stepping on the last batch loss alone is noisy)
        epoch_train_loss /= len(train_loader)
        scheduler.step(epoch_train_loss)
        # print optimizer learning rate
        print(f"Learning rate: {optimizer.param_groups[0]['lr']}")
        # Evaluate model on test set in eval mode (BatchNorm uses running
        # statistics) and without tracking gradients
        model.eval()
        with torch.no_grad():
            y_test_pred = (
                model(torch.tensor(X_test).float().to(device)).cpu().numpy()
            )
        model.train()
test_loss = criterion(
torch.tensor(y_test_pred).float(),
torch.tensor(y_test).float().view(-1, 1),
)
print(f"Train loss: {epoch_train_loss:.4f}")
print(f"Test loss: {test_loss.item():.4f}")
test_losses.append(test_loss.item())
train_losses.append(epoch_train_loss)
# save model if test loss has decreased
if len(test_losses) == 1 or test_loss < min(test_losses[:-1]):
torch.save(
model.state_dict(),
f"checkpoints/mlp_{epoch}.pth",
)
# Plot losses
sns.lineplot(x=range(len(train_losses)), y=train_losses, label="Train loss")
sns.lineplot(x=range(len(test_losses)), y=test_losses, label="Test loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()
# load best model from checkpoint
print(f"Loading model from checkpoint: mlp_{np.argmin(test_losses)}.pth")
model.load_state_dict(torch.load(f"checkpoints/mlp_{np.argmin(test_losses)}.pth"))
    # predict on test set with the best checkpoint (eval mode, no gradients)
    model.eval()
    with torch.no_grad():
        y_test_pred = model(torch.tensor(X_test).float().to(device)).cpu().numpy()
    y_test_pred_binary = np.where(y_test_pred > 0.5, 1, 0)
    # calculate confusion matrix (sklearn layout: rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, y_test_pred_binary)
print(cm)
# calculate accuracy
accuracy = np.sum(np.diag(cm)) / np.sum(cm)
print(f"Accuracy: {accuracy}")
# print recall
recall = cm[1, 1] / (cm[1, 0] + cm[1, 1])
print(f"Recall: {recall}")
# print precision
precision = cm[1, 1] / (cm[0, 1] + cm[1, 1])
print(f"Precision: {precision}")
# print F1 score
f1 = 2 * (precision * recall) / (precision + recall)
print(f"F1 score: {f1}")
# print F2 score
f2 = 5 * (precision * recall) / (4 * precision + recall)
print(f"F2 score: {f2}")
# print AUC
auc = roc_auc_score(y_test, y_test_pred)
print(f"AUC: {auc}")
# plot confusion matrix using seaborn
sns.heatmap(cm, annot=True, fmt="d")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()
return model
def main():
X_train, y_train, X_test, y_test = prepare_data()
model = train_model(X_train, y_train, X_test, y_test)
# print parameter count of model
print(f"Parameter count: {sum(p.numel() for p in model.parameters())}")
if __name__ == "__main__":
main()