Verified Commit a6219051 authored by BARBIER Marc

add flake8 lint

parent 0a3e35cf
@@ -20,4 +20,15 @@
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.python.pydev.pythonNature</nature>
</natures>
<filteredResources>
<filter>
<id>1655190855508</id>
<name></name>
<type>30</type>
<matcher>
<id>org.eclipse.core.resources.regexFilterMatcher</id>
<arguments>node_modules|.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__</arguments>
</matcher>
</filter>
</filteredResources>
</projectDescription>
@@ -4,35 +4,37 @@ pattern-based anomaly detection
authors: Len Feremans
created: 8 May 2019
Used for integration with TIPM: A tool for interactive time series
pattern mining and anomaly detection (https://bitbucket.org/len_feremans/tipm_pub).
For TIPM we run PBAD_Embed command-line, which is PBAD without preprocessing and pattern mining first,
since this is done by that tool. PBAD_Embed computes weighted occurrences and an isolation forest score.
"""
import sys, os
import sys
import os
import pandas as pd
import numpy as np
from methods.PBAD_Embed import PBAD_Embed
from sklearn.metrics import roc_auc_score, average_precision_score
from collections import defaultdict
#Convert nested list of windows to a 2d numpy array
#Problem: if windows have different dimensions, np.array does not create a matrix,
#but a list of objects.
#Create a matrix and pad windows with 0's if necessary
# Convert nested list of windows to a 2d numpy array
# Problem: if windows have different dimensions, np.array does not create a matrix,
# but a list of objects.
# Create a matrix and pad windows with 0's if necessary
def windows2numpy(listOfWindows):
normal_length = len(listOfWindows[len(listOfWindows)//2])
listOfWindows2 = [];
listOfWindows2 = []
for i in range(0, len(listOfWindows)):
lst1 = listOfWindows[i]
lenLst1 = len(lst1)
if lenLst1 != normal_length:
if lenLst1 > normal_length:
raise Exception("Length is higher than expected")
else:
for i in range(0, normal_length - lenLst1):
lst1.append(0.0)
for idx, val in enumerate(lst1): #bug in PBAD, called from TIPM, if empty values
for idx, val in enumerate(lst1): # bug in PBAD, called from TIPM, if empty values
if val == '?':
lst1[idx] = 0.0
np_arr = np.array(lst1).astype(np.float64)
@@ -40,14 +42,15 @@ def windows2numpy(listOfWindows):
np_arr = np.array(listOfWindows2)
print('Debug: windows2numpy: type {}, type(arr[0]) {}, type(arr[0][0]) {}, shape {}, arr[0][0] {}'.format(
type(np_arr),
type(np_arr[0]),
type(np_arr[0][0]),
np_arr.shape,
np_arr[0][0]))
return np_arr
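A quick usage sketch of windows2numpy (the ragged input below is made up for illustration): the expected window length is taken from the middle window, shorter windows are zero-padded, longer ones raise an exception, and '?' placeholders become 0.0.

    # hypothetical input: three windows, the last one shorter than the middle one
    windows = [[1.0, 2.0, 3.0], [4.0, '?', 6.0], [7.0, 8.0]]
    arr = windows2numpy(windows)
    # arr is a (3, 3) float64 matrix; the '?' and the missing value are both 0.0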
if __name__ == '__main__':
#parse arguments
# parse arguments
usage = "main_TIPM -input CSVFILE -type all -columns pc1,pc2\n" + \
"-itemset_fnames pc1_closed_item.txt,pc2_closed_item.txt\n" + \
"-sequential_fnames pc1_closed_sp.txt,pc2_closed_sp.txt\n" + \
@@ -56,12 +59,13 @@ if __name__ == '__main__':
print('Argument List:' + str(arguments))
if '-?' in arguments:
print(usage)
sys.exit(0) #normal end, for -? parameter
sys.exit(0) # normal end, for -? parameter
if not('-type' in arguments and '-columns' in arguments and '-input' in arguments
and ('-itemset_fnames' in arguments or '-sequential_fnames' in arguments)):
and ('-itemset_fnames' in arguments or '-sequential_fnames' in arguments)):
print(usage)
sys.exit(-1)
def get_argument(key):
for idx, arg in enumerate(arguments):
if arg.strip().lower() == key:
@@ -70,90 +74,93 @@ if __name__ == '__main__':
else:
raise Exception("Illegal last argument. " + str(arguments))
return None
inputfilename = get_argument('-input')
input_filename = get_argument('-input')
pattern_type = get_argument('-type')
columns = get_argument('-columns').lower().split(',')
itemset_fnames = get_argument('-itemset_fnames')
sequential_fnames = get_argument('-sequential_fnames')
score_fname = get_argument('-score_fname')
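For concreteness, a full invocation matching this parser might look like the following; the input and score filenames are hypothetical, while the pattern filenames are taken from the usage string above.

    main_TIPM -input demo.csv -type all -columns pc1,pc2
        -itemset_fnames pc1_closed_item.txt,pc2_closed_item.txt
        -sequential_fnames pc1_closed_sp.txt,pc2_closed_sp.txt
        -score_fname scores.csv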
#Validation command-line arguments
# Validation command-line arguments
# 1) Type is either all, itemset, sequential
# 2) Depending on the type we expect a file with itemsets and/or sequential patterns for each column
if not pattern_type in ['all', 'itemset', 'sequential']:
print('Type not in ' + str(['all', 'itemset', 'sequential']));
if pattern_type not in ['all', 'itemset', 'sequential']:
print('Type not in ' + str(['all', 'itemset', 'sequential']))
print(usage)
sys.exit(-1)
if not os.path.isfile(inputfilename):
print('input does not exist')
if not os.path.isfile(input_filename):
print('input does not exist')
print(usage)
sys.exit(-1)
if (pattern_type == 'all' or pattern_type=='itemset') and itemset_fnames == None:
print('Specify -itemset_fnames')
if (pattern_type == 'all' or pattern_type == 'itemset') and itemset_fnames is None:
print('Specify -itemset_fnames')
print(usage)
sys.exit(-1)
if (pattern_type == 'all' or pattern_type=='sequential') and sequential_fnames == None:
print('Specify -sequential_fnames')
if (pattern_type == 'all' or pattern_type == 'sequential') and sequential_fnames is None:
print('Specify -sequential_fnames')
print(usage)
sys.exit(-1)
for fnames in [itemset_fnames, sequential_fnames]:
if fnames != None:
if fnames is not None:
for idx, fname in enumerate(fnames.split(',')):
if not os.path.isfile(fname):
print('pattern input does not exist ' + fname)
print(usage)
sys.exit(-1)
else:
f = open(fname, 'r')
l1 = f.readline().lower().split(',')
l2 = f.readline().lower().split(',')
print(str(idx) + ': Reading patterns ' + fname + ' for testing\n' + str(l1) + '\n' + str(l2))
#print(' Associate column: ' + columns[idx])
f.close()
#Validation CSV file
# print(' Associate column: ' + columns[idx])
f.close()
# Validation CSV file
# Assumes CSV file has following structure:
# 1) First column is timestamp/time step
# 2) Label column is named "label"
# 3) Window column is named "window"
# 4) For each continuous time series with name X, the corresponding discretized column has name X_D
# 5) Pattern sets are 1-dimensional
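Given the five assumptions above, a minimal valid input CSV might look like this (the column name pc1 and all values are invented for illustration; per the labeling rule later in the script, 1 = anomaly, -1 = normal, 0 = unlabeled):

    timestamp,pc1,pc1_d,window,label
    1,0.42,3,0,-1
    2,0.47,3,0,-1
    3,8.90,9,1,1
    4,0.44,3,1,0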
f = open(inputfilename, 'r')
f = open(input_filename, 'r')
columns_csv = f.readline().lower().strip().split(',')
f.close()
print('Reading CSVFile ' + str(columns_csv))
if not 'window' in columns_csv:
if 'window' not in columns_csv:
print('Expecting column window')
sys.exit(-1)
if not 'label' in columns_csv:
sys.exit(-1)
if 'label' not in columns_csv:
print('Expecting column label')
sys.exit(-1)
#If discrete column names are passed, fix this
columns = [col if not col.endswith('_d') else col[0:len(col)-2] for col in columns]
sys.exit(-1)
# If discrete column names are passed, fix this
columns = [col if not col.endswith('_d') else col[0:len(col)-2] for col in columns]
for col in columns:
if not col in columns_csv:
if col not in columns_csv:
print('Expecting time series column ' + col)
sys.exit(-1)
if not col + '_d' in columns_csv:
sys.exit(-1)
if col + '_d' not in columns_csv:
print('Expecting time series discretized column with name ' + col + '_d')
sys.exit(-1)
#RUN
#preprocess: create windows for each continuous column, i.e. group by window column in TIPM
sys.exit(-1)
# RUN
# preprocess: create windows for each continuous column, i.e. group by window column in TIPM
# for labels create 1 (anomaly) if 1 is in the window, -1 (good) if -1 is in the window and 1 is not, else 0
#Note: Doing this in plain-old python, instead of using more efficient numpy stuff
df = pd.read_csv(inputfilename, header=0, index_col=0)
# Note: Doing this in plain-old python, instead of using more efficient numpy stuff
df = pd.read_csv(input_filename, header=0, index_col=0)
cols = [c.lower().strip() for c in list(df.columns.values)]
rows = df.values.tolist()
windowIdx = cols.index("window")
labelIdx = cols.index("label")
columnsIdx = [cols.index(col) for col in columns]
discrete_columnsIdx = [cols.index(col+'_d') for col in columns]
group_by_window = defaultdict(list)
current_window = None
windows = list()
for row in rows:
window = row[windowIdx]
if not window in windows:
if window not in windows:
windows.append(window)
group_by_window[window].append(row)
windowed_labels = []
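The per-window label aggregation described in the "# for labels create ..." comment above falls in the region elided between these hunks. A minimal sketch of that rule, reusing the structures just built (this implements the stated rule, not necessarily the author's exact code):

    # 1 if any 1 in the window, else -1 if any -1, else 0 (unlabeled)
    for window in windows:
        labels_in_window = [row[labelIdx] for row in group_by_window[window]]
        if 1 in labels_in_window:
            windowed_labels.append(1)
        elif -1 in labels_in_window:
            windowed_labels.append(-1)
        else:
            windowed_labels.append(0)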
@@ -180,36 +187,40 @@ if __name__ == '__main__':
windowed_series[i].append(series[i])
for i in range(0, len(discrete_columnsIdx)):
windowed_series_discrete[i].append(discrete_series[i])
#transform to datastructures for PBAD
window_labels=np.array(windowed_labels)
# transform to datastructures for PBAD
window_labels = np.array(windowed_labels)
continuous_data = {}
continuous_data_discretized={}
continuous_data_discretized = {}
for i in range(0, len(columnsIdx)):
continuous_data[i] = windows2numpy(windowed_series[i])
continuous_data_discretized[i] = windows2numpy(windowed_series_discrete[i])
#cont_series = {0: data.iloc[:, 0].values}
#labels = data.iloc[:, 1].values
#cd_D, cd_UD, _, window_labels = preprocess(cont_series, labels=labels)
# cont_series = {0: data.iloc[:, 0].values}
# labels = data.iloc[:, 1].values
# cd_D, cd_UD, _, window_labels = preprocess(cont_series, labels=labels)
# run PBAD
print('\nRunning PBAD Embed: This computes an embedding of patterns, that is, a weighted occurrence score for each pattern and each window, ' + \
print('\nRunning PBAD Embed: This computes an embedding of patterns, that is, a weighted occurrence score for each pattern and each window, ' +
'and then computes an anomaly score using an isolation forest. Pattern sets must be provided.')
if itemset_fnames != None:
if itemset_fnames is not None:
itemset_fnames = itemset_fnames.split(',')
if sequential_fnames != None:
if sequential_fnames is not None:
sequential_fnames = sequential_fnames.split(',')
detector = PBAD_Embed(pattern_type=pattern_type, itemset_filenames_cont=itemset_fnames, sp_filenames_cont=sequential_fnames)
scores = detector.fit_predict(continuous_data_discretized, continuous_data)
ixl = np.where(window_labels != 0)[0]
auc = roc_auc_score(y_true=window_labels[ixl], y_score=scores[ixl])
ap = average_precision_score(y_true=window_labels[ixl], y_score=scores[ixl])
print("AUC: {:.3f}".format(auc))
print("AP: {:.3f}".format(ap))
#save score
if score_fname != None:
print("AP: {:.3f}".format(ap))
# save score
if score_fname is not None:
f = open(score_fname, 'w')
f.write("Window,Score\n")
f.write("Window,Score\n")
for idx, win in enumerate(windows):
score = scores[idx]
f.write("{},{:.6f}\n".format(win,score))
f.close()
print("Saved {}".format(score_fname))
\ No newline at end of file
f.write("{},{:.6f}\n".format(win, score))
f.close()
print("Saved {}".format(score_fname))
[flake8]
max-line-length=150
exclude = .git main.py
docstring-convention = numpy
[flake8]
max-line-length=150
exclude = .git
docstring-convention = numpy
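A note on the two [flake8] sections above: the diff view omits filenames, but these appear to be two separate config files added by this commit (flake8 reads its settings from the nearest setup.cfg, tox.ini, or .flake8). Two caveats: docstring-convention is consumed by the flake8-docstrings plugin, so that plugin needs to be installed; and flake8's exclude option expects a comma-separated list, so the space-separated ".git main.py" in the first section is probably parsed as a single pattern (".git,main.py" may be the intent). With a config in place, a typical run is:

    python -m flake8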