Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
ML_Operator_Decision
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
EYHORN Konstantin
ML_Operator_Decision
Commits
76186a88
Commit
76186a88
authored
1 year ago
by
Konstantin Gerd Eyhorn
Browse files
Options
Downloads
Patches
Plain Diff
balancing train data and more evaluation metrics
parent
deaa32b4
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
mlp_train.py
+94
-20
94 additions, 20 deletions
mlp_train.py
with
94 additions
and
20 deletions
mlp_train.py
+
94
−
20
View file @
76186a88
...
...
@@ -4,32 +4,66 @@ import numpy as np
import
torch
from
tqdm
import
trange
,
tqdm
from
sklearn.model_selection
import
train_test_split
import
time
import
matplotlib.pyplot
as
plt
from
sklearn.metrics
import
confusion_matrix
import
seaborn
as
sns
from
sklearn.metrics
import
roc_auc_score
# S_features_filter = [abs_S_Smin,rel_S_Smin_semi_width,rel_S_Smin_full_width,abs_S_Smax,rel_S_Smax_semi_width,rel_S_Smax_full_width,count_anomalies_S,ratio_anomalies_S,max_variation_S]
# T_features_filter = [abs_T_Tmin,rel_T_Tmin_semi_width,rel_T_Tmin_full_width,abs_T_Tmax,rel_T_Tmax_semi_width,rel_T_Tmax_full_width,count_anomalies_T,ratio_anomalies_T,max_variation_T]
# B_features_filter = [mean_correlation,nb_measurements]
S_features_filter
=
[
"
abs_S_Smin
"
,
"
rel_S_Smin_semi_width
"
,
"
rel_S_Smin_full_width
"
,
"
abs_S_Smax
"
,
"
rel_S_Smax_semi_width
"
,
"
rel_S_Smax_full_width
"
,
"
count_anomalies_S
"
,
"
ratio_anomalies_S
"
,
"
max_variation_S
"
,
]
T_features_filter
=
[
"
abs_T_Tmin
"
,
"
rel_T_Tmin_semi_width
"
,
"
rel_T_Tmin_full_width
"
,
"
abs_T_Tmax
"
,
"
rel_T_Tmax_semi_width
"
,
"
rel_T_Tmax_full_width
"
,
"
count_anomalies_T
"
,
"
ratio_anomalies_T
"
,
"
max_variation_T
"
,
]
B_features_filter
=
[
"
mean_correlation
"
,
"
nb_measurements
"
]
PICKLE_PATH
=
"
dataset_pandas/temperature.pkl
"
##### HYPERPARAMETERS #####
EPOCHS
=
300
BATCH_SIZE
=
16
BATCH_SIZE
=
32
CRITERION
=
nn
.
BCELoss
()
OPTIMIZER
=
torch
.
optim
.
Adam
LEARNING_RATE
=
0.01
GROWTH_RATE
=
16
DROP_RATE
=
0.5
SCHEDULER_PATIENCE
=
2
0
SCHEDULER_PATIENCE
=
1
0
SCHEDULER_FACTOR
=
0.5
SCHEDULER_EPS
=
1e-8
input_features
=
11
class
MLP
(
nn
.
Module
):
def
__init__
(
self
):
super
(
MLP
,
self
).
__init__
()
self
.
block1
=
nn
.
Sequential
(
nn
.
Linear
(
20
,
GROWTH_RATE
),
nn
.
BatchNorm1d
(
GROWTH_RATE
),
nn
.
ReLU
()
nn
.
Linear
(
input_features
,
GROWTH_RATE
),
nn
.
BatchNorm1d
(
GROWTH_RATE
),
nn
.
ReLU
(),
)
self
.
block2
=
nn
.
Sequential
(
nn
.
Linear
(
GROWTH_RATE
,
GROWTH_RATE
),
nn
.
BatchNorm1d
(
GROWTH_RATE
),
nn
.
ReLU
()
...
...
@@ -68,25 +102,42 @@ class MLP(nn.Module):
return
y
def
prepare_data
()
->
tuple
[
np
.
ndarray
,
np
.
ndarray
]:
def
prepare_data
()
->
tuple
[
np
.
ndarray
,
np
.
ndarray
,
np
.
ndarray
,
np
.
ndarray
]:
# Load data
df
=
pd
.
read_pickle
(
PICKLE_PATH
)
X
=
df
.
drop
(
columns
=
[
"
alarm
"
]).
to_numpy
()
y
=
df
[
"
alarm
"
].
to_numpy
(
)
# drop Salinity Features
df
=
df
.
drop
(
columns
=
S_features_filter
)
assert
X
.
shape
[
1
]
==
20
,
"
Number of features should be 20
"
assert
y
.
shape
[
0
]
==
X
.
shape
[
0
],
"
Number of labels should match number of samples
"
# split the into training and testing sets
train_df
,
test_df
=
train_test_split
(
df
,
test_size
=
0.2
,
random_state
=
123456789
)
return
X
,
y
print
(
f
"
Train alarm distribution (befor undersampling):
{
train_df
[
'
alarm
'
].
value_counts
()
}
"
)
# balance the training set
min_alarm
=
train_df
[
"
alarm
"
].
value_counts
().
min
()
train_df
=
pd
.
concat
(
[
train_df
[
train_df
[
"
alarm
"
]
==
0
].
sample
(
min_alarm
),
train_df
[
train_df
[
"
alarm
"
]
==
1
].
sample
(
min_alarm
),
]
)
print
(
f
"
Train alarm distribution:
{
train_df
[
'
alarm
'
].
value_counts
()
}
"
)
def
train_model
(
X
:
np
.
ndarray
,
y
:
np
.
ndarray
):
X_train
=
train_df
.
drop
(
columns
=
[
"
alarm
"
]).
values
y_train
=
train_df
[
"
alarm
"
].
values
device
=
torch
.
device
(
"
cuda
"
if
torch
.
cuda
.
is_available
()
else
"
cpu
"
)
X_test
=
test_df
.
drop
(
columns
=
[
"
alarm
"
]).
values
y_test
=
test_df
[
"
alarm
"
].
values
# Splitting the data into training and testing sets
X_train
,
X_test
,
y_train
,
y_test
=
train_test_split
(
X
,
y
,
test_size
=
0.2
)
return
X_train
,
y_train
,
X_test
,
y_test
def
train_model
(
X_train
,
y_train
,
X_test
,
y_test
):
device
=
torch
.
device
(
"
cuda
"
if
torch
.
cuda
.
is_available
()
else
"
cpu
"
)
# Setting up the data loader
train_loader
=
torch
.
utils
.
data
.
DataLoader
(
...
...
@@ -191,22 +242,45 @@ def train_model(X: np.ndarray, y: np.ndarray):
# calculate confusion matrix
cm
=
confusion_matrix
(
y_test
,
y_test_pred_binary
)
print
(
cm
)
# calculate accuracy
accuracy
=
np
.
sum
(
np
.
diag
(
cm
))
/
np
.
sum
(
cm
)
print
(
f
"
Accuracy:
{
accuracy
}
"
)
# print recall
recall
=
cm
[
1
,
1
]
/
(
cm
[
1
,
0
]
+
cm
[
1
,
1
])
print
(
f
"
Recall:
{
recall
}
"
)
# print precision
precision
=
cm
[
1
,
1
]
/
(
cm
[
0
,
1
]
+
cm
[
1
,
1
])
print
(
f
"
Precision:
{
precision
}
"
)
# print F1 score
f1
=
2
*
(
precision
*
recall
)
/
(
precision
+
recall
)
print
(
f
"
F1 score:
{
f1
}
"
)
# print F2 score
f2
=
5
*
(
precision
*
recall
)
/
(
4
*
precision
+
recall
)
print
(
f
"
F2 score:
{
f2
}
"
)
# print AUC
auc
=
roc_auc_score
(
y_test
,
y_test_pred
)
print
(
f
"
AUC:
{
auc
}
"
)
# plot confusion matrix using seaborn
sns
.
heatmap
(
cm
,
annot
=
True
,
fmt
=
"
d
"
)
plt
.
xlabel
(
"
Predicted
"
)
plt
.
ylabel
(
"
True
"
)
plt
.
show
()
# calculate accuracy
accuracy
=
np
.
sum
(
np
.
diag
(
cm
))
/
np
.
sum
(
cm
)
print
(
f
"
Accuracy:
{
accuracy
}
"
)
return
model
def
main
():
X
,
y
=
prepare_data
()
model
=
train_model
(
X
,
y
)
X
_train
,
y_train
,
X_test
,
y_test
=
prepare_data
()
model
=
train_model
(
X
_train
,
y_train
,
X_test
,
y_test
)
# print parameter count of model
print
(
f
"
Parameter count:
{
sum
(
p
.
numel
()
for
p
in
model
.
parameters
())
}
"
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment