Commit 76186a88 authored by Konstantin Gerd Eyhorn

balancing train data and more evaluation metrics

parent deaa32b4
@@ -4,32 +4,66 @@ import numpy as np
 import torch
 from tqdm import trange, tqdm
 from sklearn.model_selection import train_test_split
+import time
 import matplotlib.pyplot as plt
 from sklearn.metrics import confusion_matrix
 import seaborn as sns
+from sklearn.metrics import roc_auc_score
+# S_features_filter = [abs_S_Smin,rel_S_Smin_semi_width,rel_S_Smin_full_width,abs_S_Smax,rel_S_Smax_semi_width,rel_S_Smax_full_width,count_anomalies_S,ratio_anomalies_S,max_variation_S]
+# T_features_filter = [abs_T_Tmin,rel_T_Tmin_semi_width,rel_T_Tmin_full_width,abs_T_Tmax,rel_T_Tmax_semi_width,rel_T_Tmax_full_width,count_anomalies_T,ratio_anomalies_T,max_variation_T]
+# B_features_filter = [mean_correlation,nb_measurements]
+
+S_features_filter = [
+    "abs_S_Smin",
+    "rel_S_Smin_semi_width",
+    "rel_S_Smin_full_width",
+    "abs_S_Smax",
+    "rel_S_Smax_semi_width",
+    "rel_S_Smax_full_width",
+    "count_anomalies_S",
+    "ratio_anomalies_S",
+    "max_variation_S",
+]
+T_features_filter = [
+    "abs_T_Tmin",
+    "rel_T_Tmin_semi_width",
+    "rel_T_Tmin_full_width",
+    "abs_T_Tmax",
+    "rel_T_Tmax_semi_width",
+    "rel_T_Tmax_full_width",
+    "count_anomalies_T",
+    "ratio_anomalies_T",
+    "max_variation_T",
+]
+B_features_filter = ["mean_correlation", "nb_measurements"]
 PICKLE_PATH = "dataset_pandas/temperature.pkl"
 
 ##### HYPERPARAMETERS #####
 EPOCHS = 300
-BATCH_SIZE = 16
+BATCH_SIZE = 32
 CRITERION = nn.BCELoss()
 OPTIMIZER = torch.optim.Adam
 LEARNING_RATE = 0.01
 GROWTH_RATE = 16
 DROP_RATE = 0.5
-SCHEDULER_PATIENCE = 20
+SCHEDULER_PATIENCE = 10
 SCHEDULER_FACTOR = 0.5
 SCHEDULER_EPS = 1e-8
+
+input_features = 11
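
Note: input_features = 11 is consistent with the data change below: the original 20 features minus the 9 dropped salinity features. The SCHEDULER_* constants are not wired up anywhere in the visible hunks; presumably they feed a torch.optim.lr_scheduler.ReduceLROnPlateau. A minimal sketch of that wiring, under that assumption (mode="min" and the monitored quantity are guesses, not shown in this commit):

# Sketch only -- the optimizer/scheduler construction is outside the visible hunks.
model = MLP()
optimizer = OPTIMIZER(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",                   # assumes validation loss is the monitored metric
    factor=SCHEDULER_FACTOR,      # halve the learning rate on each plateau
    patience=SCHEDULER_PATIENCE,  # now 10 epochs without improvement, down from 20
    eps=SCHEDULER_EPS,            # ignore lr updates smaller than this threshold
)
# During training, typically scheduler.step(val_loss) once per epoch.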
 class MLP(nn.Module):
     def __init__(self):
         super(MLP, self).__init__()
         self.block1 = nn.Sequential(
-            nn.Linear(20, GROWTH_RATE), nn.BatchNorm1d(GROWTH_RATE), nn.ReLU()
+            nn.Linear(input_features, GROWTH_RATE),
+            nn.BatchNorm1d(GROWTH_RATE),
+            nn.ReLU(),
         )
         self.block2 = nn.Sequential(
             nn.Linear(GROWTH_RATE, GROWTH_RATE), nn.BatchNorm1d(GROWTH_RATE), nn.ReLU()
@@ -68,25 +102,42 @@ class MLP(nn.Module):
         return y
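
The diff truncates the rest of the class body. From the visible pieces (two hidden blocks, GROWTH_RATE, DROP_RATE, and BCELoss expecting probabilities), the full model plausibly ends in a sigmoid head; the sketch below is an assumption about the elided code, not the commit's actual architecture:

from torch import nn

# Hypothetical reconstruction; depth and dropout placement are guesses.
class MLPSketch(nn.Module):
    def __init__(self):
        super().__init__()
        self.block1 = nn.Sequential(
            nn.Linear(input_features, GROWTH_RATE),
            nn.BatchNorm1d(GROWTH_RATE),
            nn.ReLU(),
        )
        self.block2 = nn.Sequential(
            nn.Linear(GROWTH_RATE, GROWTH_RATE), nn.BatchNorm1d(GROWTH_RATE), nn.ReLU()
        )
        self.dropout = nn.Dropout(DROP_RATE)
        # BCELoss takes probabilities, so the output layer ends in a sigmoid
        self.head = nn.Sequential(nn.Linear(GROWTH_RATE, 1), nn.Sigmoid())

    def forward(self, x):
        x = self.block1(x)
        x = self.dropout(self.block2(x))
        return self.head(x).squeeze(1)  # shape (batch,) to match BCELoss targets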
-def prepare_data() -> tuple[np.ndarray, np.ndarray]:
+def prepare_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
     # Load data
     df = pd.read_pickle(PICKLE_PATH)
-    X = df.drop(columns=["alarm"]).to_numpy()
-    y = df["alarm"].to_numpy()
-    assert X.shape[1] == 20, "Number of features should be 20"
-    assert y.shape[0] == X.shape[0], "Number of labels should match number of samples"
-    return X, y
+    # drop Salinity Features
+    df = df.drop(columns=S_features_filter)
+
+    # split into training and testing sets
+    train_df, test_df = train_test_split(df, test_size=0.2, random_state=123456789)
+    print(
+        f"Train alarm distribution (before undersampling): {train_df['alarm'].value_counts()}"
+    )
+
+    # balance the training set by undersampling the majority class
+    min_alarm = train_df["alarm"].value_counts().min()
+    train_df = pd.concat(
+        [
+            train_df[train_df["alarm"] == 0].sample(min_alarm),
+            train_df[train_df["alarm"] == 1].sample(min_alarm),
+        ]
+    )
+    print(f"Train alarm distribution (after undersampling): {train_df['alarm'].value_counts()}")
+
+    X_train = train_df.drop(columns=["alarm"]).values
+    y_train = train_df["alarm"].values
+    X_test = test_df.drop(columns=["alarm"]).values
+    y_test = test_df["alarm"].values
+
+    return X_train, y_train, X_test, y_test
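
Note: the train/test split is seeded (random_state=123456789), but the two .sample(min_alarm) calls above are not, so the undersampled training set still varies between runs. A seeded variant of the same balancing step (the seed value 0 is arbitrary, not from this commit):

min_alarm = train_df["alarm"].value_counts().min()
train_df = pd.concat(
    [
        train_df[train_df["alarm"] == 0].sample(min_alarm, random_state=0),
        train_df[train_df["alarm"] == 1].sample(min_alarm, random_state=0),
    ]
)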
 
-def train_model(X: np.ndarray, y: np.ndarray):
+def train_model(X_train, y_train, X_test, y_test):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    # Splitting the data into training and testing sets
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
 
     # Setting up the data loader
     train_loader = torch.utils.data.DataLoader(
@@ -191,22 +242,45 @@ def train_model(X: np.ndarray, y: np.ndarray):
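
The DataLoader construction is cut off here. Building one from the numpy arrays that prepare_data now returns typically looks like the sketch below; the float32 dtypes and shuffle=True are assumptions, since this diff truncates the code:

# Sketch: wrapping the arrays for the training loop (assumed, not shown in the diff).
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32),
)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True
)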
     # calculate confusion matrix
     cm = confusion_matrix(y_test, y_test_pred_binary)
+    print(cm)
+
+    # calculate accuracy
+    accuracy = np.sum(np.diag(cm)) / np.sum(cm)
+    print(f"Accuracy: {accuracy}")
+
+    # calculate recall
+    recall = cm[1, 1] / (cm[1, 0] + cm[1, 1])
+    print(f"Recall: {recall}")
+
+    # calculate precision
+    precision = cm[1, 1] / (cm[0, 1] + cm[1, 1])
+    print(f"Precision: {precision}")
+
+    # calculate F1 score
+    f1 = 2 * (precision * recall) / (precision + recall)
+    print(f"F1 score: {f1}")
+
+    # calculate F2 score
+    f2 = 5 * (precision * recall) / (4 * precision + recall)
+    print(f"F2 score: {f2}")
+
+    # calculate AUC from the continuous scores
+    auc = roc_auc_score(y_test, y_test_pred)
+    print(f"AUC: {auc}")
 
     # plot confusion matrix using seaborn
     sns.heatmap(cm, annot=True, fmt="d")
     plt.xlabel("Predicted")
     plt.ylabel("True")
     plt.show()
 
-    # calculate accuracy
-    accuracy = np.sum(np.diag(cm)) / np.sum(cm)
-    print(f"Accuracy: {accuracy}")
-
     return model
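
How y_test_pred and y_test_pred_binary are produced is also outside the visible hunks. With a sigmoid output and BCELoss, the usual pattern is to read the model output as P(alarm) and threshold at 0.5; the sketch below assumes exactly that (the threshold and tensor handling are guesses):

# Sketch: continuous scores for roc_auc_score, binary labels for the confusion matrix.
model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_test_pred = model(X_test_tensor).cpu().numpy()   # probabilities in [0, 1]
y_test_pred_binary = (y_test_pred > 0.5).astype(int)   # assumed 0.5 threshold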
 
 def main():
-    X, y = prepare_data()
-    model = train_model(X, y)
+    X_train, y_train, X_test, y_test = prepare_data()
+    model = train_model(X_train, y_train, X_test, y_test)
 
     # print parameter count of model
     print(f"Parameter count: {sum(p.numel() for p in model.parameters())}")
...