Skip to content
Snippets Groups Projects
Commit 76186a88 authored by Konstantin Gerd Eyhorn
Browse files

balancing train data and more evaluation metrics

parent deaa32b4
No related branches found
No related tags found
No related merge requests found
......@@ -4,32 +4,66 @@ import numpy as np
import torch
from tqdm import trange, tqdm
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import roc_auc_score
# S_features_filter = [abs_S_Smin,rel_S_Smin_semi_width,rel_S_Smin_full_width,abs_S_Smax,rel_S_Smax_semi_width,rel_S_Smax_full_width,count_anomalies_S,ratio_anomalies_S,max_variation_S]
# T_features_filter = [abs_T_Tmin,rel_T_Tmin_semi_width,rel_T_Tmin_full_width,abs_T_Tmax,rel_T_Tmax_semi_width,rel_T_Tmax_full_width,count_anomalies_T,ratio_anomalies_T,max_variation_T]
# B_features_filter = [mean_correlation,nb_measurements]
# The salinity (S) and temperature (T) feature groups use the same nine
# column-name patterns and differ only in the variable letter, so both lists
# are expanded from one set of templates ("{v}" is replaced by "S" or "T").
_FEATURE_NAME_TEMPLATES = (
    "abs_{v}_{v}min",
    "rel_{v}_{v}min_semi_width",
    "rel_{v}_{v}min_full_width",
    "abs_{v}_{v}max",
    "rel_{v}_{v}max_semi_width",
    "rel_{v}_{v}max_full_width",
    "count_anomalies_{v}",
    "ratio_anomalies_{v}",
    "max_variation_{v}",
)

# Column names of the salinity-derived features.
S_features_filter = [name.format(v="S") for name in _FEATURE_NAME_TEMPLATES]

# Column names of the temperature-derived features.
T_features_filter = [name.format(v="T") for name in _FEATURE_NAME_TEMPLATES]

# Column names of the features that belong to neither the S nor the T group.
B_features_filter = ["mean_correlation", "nb_measurements"]
# Path of the pickled pandas DataFrame that holds the dataset.
PICKLE_PATH = "dataset_pandas/temperature.pkl"

##### HYPERPARAMETERS #####
# Each name is assigned exactly once; the earlier values (BATCH_SIZE = 16,
# SCHEDULER_PATIENCE = 20) were immediately overwritten and have been removed.
EPOCHS = 300
BATCH_SIZE = 32
CRITERION = nn.BCELoss()
OPTIMIZER = torch.optim.Adam  # optimizer class, not an instance
LEARNING_RATE = 0.01
GROWTH_RATE = 16  # width of the hidden Linear/BatchNorm1d layers in MLP
DROP_RATE = 0.5  # presumably the dropout probability -- usage not visible here
# NOTE(review): patience/factor/eps look like learning-rate scheduler
# arguments; confirm against the scheduler construction in train_model.
SCHEDULER_PATIENCE = 10
SCHEDULER_FACTOR = 0.5
SCHEDULER_EPS = 1e-8

# Input width of the first Linear layer of MLP. Presumably the 20 original
# features minus the 9 salinity features dropped in prepare_data -- confirm
# against the dataset columns.
input_features = 11
class MLP(nn.Module):
def __init__(self):
super(MLP, self).__init__()
self.block1 = nn.Sequential(
nn.Linear(20, GROWTH_RATE), nn.BatchNorm1d(GROWTH_RATE), nn.ReLU()
nn.Linear(input_features, GROWTH_RATE),
nn.BatchNorm1d(GROWTH_RATE),
nn.ReLU(),
)
self.block2 = nn.Sequential(
nn.Linear(GROWTH_RATE, GROWTH_RATE), nn.BatchNorm1d(GROWTH_RATE), nn.ReLU()
......@@ -68,25 +102,42 @@ class MLP(nn.Module):
return y
def prepare_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Load the dataset and return a balanced train split and a test split.

    The salinity feature columns are dropped, the remaining frame is split
    80/20 into train and test with a fixed seed, and the training part is
    undersampled so that both ``alarm`` classes (0 and 1) have the same
    number of rows. The test part keeps its original class distribution.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as NumPy arrays, where the
        ``X_*`` arrays hold every column except ``alarm`` and the ``y_*``
        arrays hold the ``alarm`` column.
    """
    # One seed for both the split and the undersampling so that repeated runs
    # train and evaluate on exactly the same rows.
    seed = 123456789

    # Load data
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    df = pd.read_pickle(PICKLE_PATH)

    # drop Salinity Features
    df = df.drop(columns=S_features_filter)

    # split the data into training and testing sets
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=seed)

    print(
        f"Train alarm distribution (befor undersampling): {train_df['alarm'].value_counts()}"
    )

    # balance the training set: sample every class down to the size of the
    # rarest one
    min_alarm = train_df["alarm"].value_counts().min()
    train_df = pd.concat(
        [
            train_df[train_df["alarm"] == 0].sample(min_alarm, random_state=seed),
            train_df[train_df["alarm"] == 1].sample(min_alarm, random_state=seed),
        ]
    )

    print(f"Train alarm distribution: {train_df['alarm'].value_counts()}")

    X_train = train_df.drop(columns=["alarm"]).values
    y_train = train_df["alarm"].values

    X_test = test_df.drop(columns=["alarm"]).values
    y_test = test_df["alarm"].values

    return X_train, y_train, X_test, y_test


def train_model(X_train, y_train, X_test, y_test):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Setting up the data loader
train_loader = torch.utils.data.DataLoader(
......@@ -191,22 +242,45 @@ def train_model(X: np.ndarray, y: np.ndarray):
# calculate confusion matrix
cm = confusion_matrix(y_test, y_test_pred_binary)
print(cm)
# calculate accuracy
accuracy = np.sum(np.diag(cm)) / np.sum(cm)
print(f"Accuracy: {accuracy}")
# print recall
recall = cm[1, 1] / (cm[1, 0] + cm[1, 1])
print(f"Recall: {recall}")
# print precision
precision = cm[1, 1] / (cm[0, 1] + cm[1, 1])
print(f"Precision: {precision}")
# print F1 score
f1 = 2 * (precision * recall) / (precision + recall)
print(f"F1 score: {f1}")
# print F2 score
f2 = 5 * (precision * recall) / (4 * precision + recall)
print(f"F2 score: {f2}")
# print AUC
auc = roc_auc_score(y_test, y_test_pred)
print(f"AUC: {auc}")
# plot confusion matrix using seaborn
sns.heatmap(cm, annot=True, fmt="d")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()
# calculate accuracy
accuracy = np.sum(np.diag(cm)) / np.sum(cm)
print(f"Accuracy: {accuracy}")
return model
def main():
X, y = prepare_data()
model = train_model(X, y)
X_train, y_train, X_test, y_test = prepare_data()
model = train_model(X_train, y_train, X_test, y_test)
# print parameter count of model
print(f"Parameter count: {sum(p.numel() for p in model.parameters())}")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment