Commit a6219051 (Verified) in LOUDNI Samir / PBAD, authored 2 years ago by BARBIER Marc

    add flake8 lint

Parent: 0a3e35cf
No related branches, tags, or merge requests found.
Changes: 5 changed files, with 103 additions and 70 deletions

    .project            +11   −0
    requirements.txt     +3   −0
    src/main_TIPM.py    +81  −70
    src/tox.ini          +4   −0
    tox.ini              +4   −0
.project  +11 −0

@@ -20,4 +20,15 @@
   <nature>org.eclipse.jdt.core.javanature</nature>
   <nature>org.python.pydev.pythonNature</nature>
 </natures>
+<filteredResources>
+  <filter>
+    <id>1655190855508</id>
+    <name></name>
+    <type>30</type>
+    <matcher>
+      <id>org.eclipse.core.resources.regexFilterMatcher</id>
+      <arguments>node_modules|.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__</arguments>
+    </matcher>
+  </filter>
+</filteredResources>
 </projectDescription>
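(If we read Eclipse's resource-filter encoding correctly, a filter type of 30 is the bitmask for an inheritable exclude-all filter on both files and folders, so paths matching node_modules|.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__ are hidden from the Eclipse workspace.)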
requirements.txt  (new file, mode 100644)  +3 −0

+pandas
+sklearn
+cython
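Note: on PyPI, sklearn is a deprecated placeholder for the real distribution scikit-learn, and recent versions of the placeholder deliberately fail at install time. If these requirements are ever refreshed, the safer list would presumably be:

pandas
scikit-learn
cython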
src/main_TIPM.py  +81 −70

@@ -4,35 +4,37 @@ pattern-based anomaly detection
 authors: Len Feremans
 created: 8 May 2019
 Used for integration with TIPM: a tool for interactive time series
 pattern mining and anomaly detection (https://bitbucket.org/len_feremans/tipm_pub).
 For TIPM we run PBAD_Embed command-line, which is PBAD without preprocessing and pattern mining first,
 since this is done by that tool. PBAD_Embed computes weighted occurrences and an isolation forest.
 """
-import sys, os
+import sys
+import os
 import pandas as pd
 import numpy as np
 from methods.PBAD_Embed import PBAD_Embed
 from sklearn.metrics import roc_auc_score, average_precision_score
 from collections import defaultdict

-#Convert nested list of windows to 2d numpy array
-#Problem: if windows have different dimensions, np.array does not create matrix,
-#but list of objects.
-#Create matrix and pad windows with 0's if necessary
+# Convert nested list of windows to 2d numpy array
+# Problem: if windows have different dimensions, np.array does not create matrix,
+# but list of objects.
+# Create matrix and pad windows with 0's if necessary
 def windows2numpy(listOfWindows):
     normal_length = len(listOfWindows[len(listOfWindows) // 2])
-    listOfWindows2 = [];
+    listOfWindows2 = []
     for i in range(0, len(listOfWindows)):
         lst1 = listOfWindows[i]
         lenLst1 = len(lst1)
         if lenLst1 != normal_length:
             if lenLst1 > normal_length:
                 raise Exception("Length is higher than expected")
             else:
                 for i in range(0, normal_length - lenLst1):
                     lst1.append(0.0)
         for idx, val in enumerate(lst1):  # bug in PBAD, called from TIPM, if empty values
             if val == '?':
                 lst1[idx] = 0.0
         np_arr = np.array(lst1).astype(np.float64)
@@ -40,14 +42,15 @@ def windows2numpy(listOfWindows):
     np_arr = np.array(listOfWindows2)
-    print('Debug: windows2numpy: type {}, type(arr[0]) {}, type(arr[0][0]) {} shape {}, arr[0] {}'.format(type(np_arr), type(np_arr[0]), type(np_arr[0][0]), np_arr.shape, np_arr[i][0]))
+    print('Debug: windows2numpy: type {}, type(arr[0]) {}, type(arr[0][0]) {} shape {}, arr[0] {}'.format(
+        type(np_arr), type(np_arr[0]), type(np_arr[0][0]), np_arr.shape, np_arr[i][0]))
     return np_arr

 if __name__ == '__main__':
-    #parse arguments
+    # parse arguments
     usage = "main_TIPM -input CSVFILE -type all -columns pc1,pc2\n" + \
             "-itemset_fnames pc1_closed_item.txt,pc2_closed_item.txt\n" + \
             "-sequential_fnames pc1_closed_sp.txt,pc2_closed_sp.txt\n" + \
@@ -56,12 +59,13 @@ if __name__ == '__main__':
     print('Argument List:' + str(arguments))
     if '-?' in arguments:
         print(usage)
-        sys.exit(0) #normal end, for -? parameter
+        sys.exit(0)  # normal end, for -? parameter
-    if not ('-type' in arguments and '-columns' in arguments and '-input' in arguments and ('-itemset_fnames' in arguments or '-sequential_fnames' in arguments)):
+    if not ('-type' in arguments and '-columns' in arguments and '-input' in arguments
+            and ('-itemset_fnames' in arguments or '-sequential_fnames' in arguments)):
         print(usage)
         sys.exit(-1)
     def get_argument(key):
         for idx, arg in enumerate(arguments):
             if arg.strip().lower() == key:
@@ -70,90 +74,93 @@ if __name__ == '__main__':
         else:
             raise Exception("Illegal last argument." + str(arguments))
         return None
-    inputfilename = get_argument('-input')
+    input_filename = get_argument('-input')
     pattern_type = get_argument('-type')
     columns = get_argument('-columns').lower().split(',')
     itemset_fnames = get_argument('-itemset_fnames')
     sequential_fnames = get_argument('-sequential_fnames')
     score_fname = get_argument('-score_fname')
-    #Validation command-line arguments
+    # Validation command-line arguments
     # 1) Type is either all, itemset, sequential
     # 2) Depending on type we expect a file with either itemsets and/or sequential patterns for each column
-    if not pattern_type in ['all', 'itemset', 'sequential']:
-        print('Type not in ' + str(['all', 'itemset', 'sequential']));
+    if pattern_type not in ['all', 'itemset', 'sequential']:
+        print('Type not in ' + str(['all', 'itemset', 'sequential']))
         print(usage)
         sys.exit(-1)
-    if not os.path.isfile(inputfilename):
+    if not os.path.isfile(input_filename):
         print('input does not exist')
         print(usage)
         sys.exit(-1)
-    if (pattern_type == 'all' or pattern_type == 'itemset') and itemset_fnames == None:
+    if (pattern_type == 'all' or pattern_type == 'itemset') and itemset_fnames is None:
         print('Specify -itemset_fnames')
         print(usage)
         sys.exit(-1)
-    if (pattern_type == 'all' or pattern_type == 'sequential') and sequential_fnames == None:
+    if (pattern_type == 'all' or pattern_type == 'sequential') and sequential_fnames is None:
         print('Specify -sequential_fnames')
         print(usage)
         sys.exit(-1)
     for fnames in [itemset_fnames, sequential_fnames]:
-        if fnames != None:
+        if fnames is not None:
             for idx, fname in enumerate(fnames.split(',')):
                 if not os.path.isfile(fname):
                     print('pattern input does not exist ' + fname)
                     print(usage)
                     sys.exit(-1)
                 else:
                     f = open(fname, 'r')
                     l1 = f.readline().lower().split(',')
                     l2 = f.readline().lower().split(',')
                     print(str(idx) + ': Reading patterns ' + fname + ' for testing\n' + str(l1) + '\n' + str(l2))
-                    #print(' Associate column: ' + columns[idx])
+                    # print(' Associate column: ' + columns[idx])
                     f.close()
-    #Validation CSV file
+    # Validation CSV file
     # Assumes CSV file has following structure:
     # 1) First column is timestamp/time step
     # 2) Label column is named "label"
     # 3) Window column is named "window"
     # 4) For each continuous time series with name X, the corresponding column has name X_D
     # 5) Patternsets are 1 dimensional
-    f = open(inputfilename, 'r')
+    f = open(input_filename, 'r')
     columns_csv = f.readline().lower().strip().split(',')
     f.close()
     print('Reading CSVFile ' + str(columns_csv))
-    if not 'window' in columns_csv:
+    if 'window' not in columns_csv:
         print('Expecting column window')
         sys.exit(-1)
-    if not 'label' in columns_csv:
+    if 'label' not in columns_csv:
         print('Expecting column label')
         sys.exit(-1)
-    #If discrete column names are passed, fix this
+    # If discrete column names are passed, fix this
     columns = [col if not col.endswith('_d') else col[0: len(col) - 2] for col in columns]
     for col in columns:
-        if not col in columns_csv:
+        if col not in columns_csv:
             print('Expecting time series column ' + col)
             sys.exit(-1)
-        if not col + '_d' in columns_csv:
+        if col + '_d' not in columns_csv:
             print('Expecting time series discretized column with name ' + col + '_d')
             sys.exit(-1)
-    #RUN
-    #preprocess: create windows for each continuous column, i.e. group by window column in TIPM
+    # RUN
+    # preprocess: create windows for each continuous column, i.e. group by window column in TIPM
     # for labels create either 1 (anomaly) if 1 is in window, or -1 (good) if -1 in window and not 1, else 0
-    #Note: Doing this in plain-old python, instead of using more efficient numpy stuff
-    df = pd.read_csv(inputfilename, header=0, index_col=0)
+    # Note: Doing this in plain-old python, instead of using more efficient numpy stuff
+    df = pd.read_csv(input_filename, header=0, index_col=0)
     cols = [c.lower().strip() for c in list(df.columns.values)]
     rows = df.values.tolist()
     windowIdx = cols.index("window")
     labelIdx = cols.index("label")
     columnsIdx = [cols.index(col) for col in columns]
     discrete_columnsIdx = [cols.index(col + '_d') for col in columns]
     group_by_window = defaultdict(list)
     current_window = None
     windows = list()
     for row in rows:
         window = row[windowIdx]
-        if not window in windows:
+        if window not in windows:
             windows.append(window)
         group_by_window[window].append(row)
     windowed_labels = []
@@ -180,36 +187,40 @@ if __name__ == '__main__':
             windowed_series[i].append(series[i])
         for i in range(0, len(discrete_columnsIdx)):
             windowed_series_discrete[i].append(discrete_series[i])
-    #transform to datastructures for PBAD
+    # transform to datastructures for PBAD
     window_labels = np.array(windowed_labels)
     continuous_data = {}
     continuous_data_discretized = {}
     for i in range(0, len(columnsIdx)):
         continuous_data[i] = windows2numpy(windowed_series[i])
         continuous_data_discretized[i] = windows2numpy(windowed_series_discrete[i])
-    #cont_series = {0: data.iloc[:, 0].values}
-    #labels = data.iloc[:, 1].values
-    #cd_D, cd_UD, _, window_labels = preprocess(cont_series, labels=labels)
+    # cont_series = {0: data.iloc[:, 0].values}
+    # labels = data.iloc[:, 1].values
+    # cd_D, cd_UD, _, window_labels = preprocess(cont_series, labels=labels)
     # run PBAD
-    print('\nRunning PBAD Embed: This computes embedding of patterns, that is a weighted occurrences score for each pattern and each window,' + \
+    print('\nRunning PBAD Embed: This computes embedding of patterns, that is a weighted occurrences score for each pattern and each window,' +
           ' and then compute an anomaly score using isolation forests. Patternsets must be provided.')
-    if itemset_fnames != None:
+    if itemset_fnames is not None:
         itemset_fnames = itemset_fnames.split(',')
-    if sequential_fnames != None:
+    if sequential_fnames is not None:
         sequential_fnames = sequential_fnames.split(',')
     detector = PBAD_Embed(pattern_type=pattern_type, itemset_filenames_cont=itemset_fnames, sp_filenames_cont=sequential_fnames)
     scores = detector.fit_predict(continuous_data_discretized, continuous_data)
     ixl = np.where(window_labels != 0)[0]
     auc = roc_auc_score(y_true=window_labels[ixl], y_score=scores[ixl])
     ap = average_precision_score(y_true=window_labels[ixl], y_score=scores[ixl])
     print("AUC: {:.3f}".format(auc))
     print("AP: {:.3f}".format(ap))
-    #save score
-    if score_fname != None:
+    # save score
+    if score_fname is not None:
         f = open(score_fname, 'w')
         f.write("Window,Score\n")
         for idx, win in enumerate(windows):
             score = scores[idx]
-            f.write("{},{:.6f}\n".format(win, score))
-            f.close()
-            print("Saved {}".format(score_fname))
\ No newline at end of file
+            f.write("{},{:.6f}\n".format(win, score))
+        f.close()
+        print("Saved {}".format(score_fname))
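For readers of the diff above: windows2numpy flattens a ragged list of windows into a dense 2-D float matrix by right-padding short windows with zeros (using the length of the middle window as the reference) and replacing '?' placeholders with 0.0. A minimal, self-contained sketch of that behaviour, using hypothetical toy data that is not part of the commit:

import numpy as np

# Toy windows: the middle window sets the reference length (here 3).
windows = [[1.0, 2.0], [3.0, '?', 5.0], [6.0, 7.0, 8.0]]
normal_length = len(windows[len(windows) // 2])

padded = []
for w in windows:
    w = list(w) + [0.0] * (normal_length - len(w))   # right-pad short windows
    w = [0.0 if v == '?' else v for v in w]          # '?' markers become 0.0
    padded.append(np.array(w).astype(np.float64))

matrix = np.array(padded)
print(matrix.shape)   # (3, 3): a proper 2-D matrix, not an object array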
src/tox.ini  (new file, mode 100644)  +4 −0

+[flake8]
+max-line-length = 150
+exclude = .git main.py
+docstring-convention = numpy
tox.ini  (new file, mode 100644)  +4 −0

+[flake8]
+max-line-length = 150
+exclude = .git
+docstring-convention = numpy
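flake8 discovers its configuration from the [flake8] section of a tox.ini, setup.cfg, or .flake8 file in the directory it runs from, which is presumably why this commit places one copy at the repository root and one under src/; the lint can then be run with `flake8 .` (or `python -m flake8 .`) from either directory. Two caveats, to the best of our knowledge: flake8 parses exclude as a comma-separated list, so the space-separated ".git main.py" in src/tox.ini is likely treated as a single pattern; and docstring-convention is read by the flake8-docstrings plugin, not flake8 itself, so it only takes effect if that plugin is installed. As a rough sketch, the same check can also be driven from Python through flake8's documented legacy API (assuming flake8 is installed, e.g. via `pip install flake8`):

# Minimal sketch: options mirror the tox.ini above.
from flake8.api import legacy as flake8

style_guide = flake8.get_style_guide(max_line_length=150, exclude=['.git'])
report = style_guide.check_files(['src/main_TIPM.py'])
print('violations found:', report.total_errors)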