Commit 5e6af4c3 authored by Kristiyan Blagov

added MERLIN

parent 70edeee4
MERLIN.py 0 → 100644
# -*- coding: utf-8 -*-
"""
Evaluate MERLIN (via its MATLAB implementation) on the UCR Anomaly Archive:
run discord discovery on each time series and check whether the reported
discord lands inside the labelled anomaly window.
"""
import pandas as pd
import numpy as np
import matlab.engine
import os
from statsmodels.tsa.stattools import acf
from scipy.signal import find_peaks
import time
from sklearn.preprocessing import MinMaxScaler
os.chdir(os.path.join(os.getcwd(), "UCR_Anomaly_Archive"))  # portable join instead of a hard-coded "\\" separator
files = os.listdir()
lstt = []
for file in files:
    if file.endswith(".txt"):
        lstt.append(file)
#listttt = os.listdir("C:/Users/Kristiyan/Desktop/Uni/Bachelor Thesis/Archive/UCR_TimeSeriesAnomalyDatasets2021/FilesAreInHere/introducingMERLIN")
#lstt = []
#for file in listttt:
# if file.endswith(".txt") and file != "qtdbSel100MLII.txt":
# lstt.append(file)
# MERLIN raised errors on the datasets in "missing"; the lists below group the UCR dataset IDs by anomaly type.
missing = ['239', '240', '241', '084']
amplitude_change = ["013","014", "037", "042", "044", "053", "057", "066", "091", "100", "104", "121", "122", "145", "150", "152", "161", "165", "174", "199", "205", "215", "217", "246"]
flat = ['045', '078', '153', '186', '236']
freq_change = ['023', '026', '032', '033', '034', '040', '048', '099', '101', '131', '134', '140', '141', '142', '148', '156', '202', '222', '223', '224', '227', '228', '229', '244', '245', '247']
local_drop = ['005', '043', '054', '063', '077', '086', '092', '102', '106', '113', '151', '162', '171', '185', '194', '200', '231', '232', '233', '237', '238']
local_peak = ['007', '021', '024', '025', '030', '049', '058', '062', '064', '085', '089', '097', '115', '129', '132', '133', '138', '157', '166', '170', '172', '193', '197', '234', '235', '243']
missing_drop = ['002', '072', '110', '180']
missing_peak = ['004', '019', '035', '036', '059', '060', '094', '112', '127', '143', '144', '167', '168', '248']
noise = ['003', '008', '027', '028', '029', '039', '056', '067', '068', '083', '095', '098', '107', '111', '116', '135', '136', '137', '147', '164', '175', '176', '191']
outlier_datasets = ['011', '012', '015', '016', '017', '018', '070', '071', '096', '119', '120', '123', '124', '125', '126', '178', '179', '192', '213', '216', '220', '226']
reverse = ['020', '022', '038', '052', '055', '065', '090', '103', '128', '130', '146', '160', '163', '173', '198', '201', '203', '209', '212', '225', '230', '242', '249']
sampling_rate = ['050', '061', '105', '158', '169']
signal_shift = ['204']
smoothed_increase = []
steep_increase = ['051', '159']
time_shift = ['069', '074', '075', '079', '080', '081', '082', '087', '088', '108', '177', '182', '183', '187', '188', '189', '190', '195', '196', '206', '207', '208']
time_warping = ['031', '076', '139', '184']
unusual_pattern = ['001', '006', '009', '010', '041', '046', '047', '073', '093', '109', '114', '117', '118', '149', '154', '155', '181', '210', '211', '214', '218', '219', '221', '250']
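# Note (added): the ID lists above let the evaluation loop further down be
# restricted to a single anomaly category via the membership test on
# name_split[0].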
def highest_autocorrelation(ts, min_size=10, max_size=1000):
    """Estimate the dominant period of ts as the ACF peak with the highest
    autocorrelation at a lag between min_size and max_size."""
    acf_values = acf(ts, fft=True, nlags=int(ts.shape[0] / 2))
    peaks, _ = find_peaks(acf_values)
    peaks = peaks[np.logical_and(peaks >= min_size, peaks < max_size)]
    corrs = acf_values[peaks]
    if len(peaks) == 0:
        # No peak in the requested range: retry with a wider upper bound.
        peaks, _ = find_peaks(acf_values)
        peaks = peaks[np.logical_and(peaks >= min_size, peaks < 2000)]
        corrs = acf_values[peaks]
        if len(peaks) == 0:
            return -1
    return peaks[np.argmax(corrs)]
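# A minimal usage sketch (illustration only, not part of the original script):
# on a noisy sine wave with a period of 50 samples, highest_autocorrelation
# should recover a lag close to 50. Note that the function is defined here but
# not called in the evaluation loop below.
#
#     demo = np.sin(np.linspace(0, 40 * np.pi, 1000)) + 0.1 * np.random.randn(1000)
#     print(highest_autocorrelation(demo))  # expected: ~50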
# Bookkeeping: count discords that land inside, before, or after the window.
correct_discord = 0
small_discord = 0
big_discord = 0
correct_discord_list = []
small_discord_list = []
big_discord_list = []
correct_discord_dist_lower = []
correct_discord_dist_higher= []
small_discord_dist = []
big_discord_dist = []
start_time = time.time()
start_process_time = time.process_time()
for dataset in lstt:
    name_split = dataset.split("_")
    # Filter by dataset ID; e.g. "if name_split[0] in flat:" runs the
    # evaluation only on time series with flat anomalies.
    if name_split[0] in ["004"]:
        print(f"Dataset #: {name_split[0]}")
        data = np.array(pd.read_csv(dataset, header=None))
        name_split[6] = name_split[6][:-4]  # strip the ".txt" extension
        # Scale the series to [0, 1] before handing it to MERLIN.
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(data).flatten()
        # The UCR file name encodes: ..._trainingEnd_anomalyBegin_anomalyEnd.txt
        train_idx = int(name_split[-3])
        begin = int(name_split[-2])
        end = int(name_split[-1])
        # Run the MATLAB implementation of MERLIN on the test portion,
        # searching discord lengths from 75 to 125.
        eng = matlab.engine.start_matlab()
        result = np.array(eng.MERLIN3_1(scaled_data[train_idx:], float(75), float(125)))
        eng.quit()
        # Pick the candidate discord supported by the most other candidates,
        # i.e. the one with the most neighbours within 100 points of it.
        count_list = []
        for k in range(len(result)):
            count = 0
            for j in range(len(result)):
                if k != j and np.abs(result[k] - result[j]) <= 100:
                    count += 1
            count_list.append(count)
        if len(count_list) != 0:
            discord = int(result[np.argmax(count_list)])
        else:
            discord = result[25]
        l = end - begin + 1  # length of the labelled anomaly
        # Map the discord back to an index in the full series; the +50 offset
        # is roughly half the searched discord length.
        discord = discord + train_idx + 50
        print(discord)
        # Score with the UCR tolerance: the prediction counts as correct if it
        # lies within max(100, anomaly length) of the labelled window.
        if min(begin - 100, begin - l) <= discord <= max(end + 100, end + l):
            print("anomaly index correct")
            correct_discord += 1
            correct_discord_list.append(name_split[0])
            correct_discord_dist_lower.append(abs(discord - min(begin - 100, begin - l)))
            correct_discord_dist_higher.append(abs(discord - max(end + 100, end + l)))
        elif discord < min(begin - 100, begin - l):
            print("anomaly index too small")
            small_discord += 1
            small_discord_list.append(name_split[0])
            small_discord_dist.append(abs(discord - min(begin - 100, begin - l)))
        elif discord > max(end + 100, end + l):
            print("anomaly index too big")
            big_discord += 1
            big_discord_list.append(name_split[0])
            big_discord_dist.append(abs(discord - max(end + 100, end + l)))
        else:
            print(f"problem with dataset {name_split[0]}")
end_time = time.time()
end_process_time = time.process_time()
runtime = end_time - start_time
print("Runtime:", runtime, "seconds")
process_time = end_process_time - start_process_time
print("Process time:", process_time, "seconds")