Commit 5e6af4c3 authored by Kristiyan Blagov

added MERLIN

parent 70edeee4
MERLIN.py 0 → 100644
# -*- coding: utf-8 -*-
"""
Evaluate MERLIN (via its MATLAB implementation) on the UCR Anomaly Archive:
run discord discovery on each time series and check whether the reported
discord lands inside the labelled anomaly window.
"""
import pandas as pd
import numpy as np
import matlab.engine
import os
from statsmodels.tsa.stattools import acf
from scipy.signal import find_peaks
import time
from sklearn.preprocessing import MinMaxScaler
os.chdir(os.path.join(os.getcwd(), "UCR_Anomaly_Archive"))  # portable join instead of a hard-coded "\\" separator
files = os.listdir()
lstt = []
for file in files:
    if file.endswith(".txt"):
        lstt.append(file)
#listttt = os.listdir("C:/Users/Kristiyan/Desktop/Uni/Bachelor Thesis/Archive/UCR_TimeSeriesAnomalyDatasets2021/FilesAreInHere/introducingMERLIN")
#lstt = []
#for file in listttt:
# if file.endswith(".txt") and file != "qtdbSel100MLII.txt":
# lstt.append(file)
# MERLIN raised errors on the datasets in "missing"; the lists below group the UCR dataset IDs by anomaly type.
missing = ['239', '240', '241', '084']
amplitude_change = ["013","014", "037", "042", "044", "053", "057", "066", "091", "100", "104", "121", "122", "145", "150", "152", "161", "165", "174", "199", "205", "215", "217", "246"]
flat = ['045', '078', '153', '186', '236']
freq_change = ['023', '026', '032', '033', '034', '040', '048', '099', '101', '131', '134', '140', '141', '142', '148', '156', '202', '222', '223', '224', '227', '228', '229', '244', '245', '247']
local_drop = ['005', '043', '054', '063', '077', '086', '092', '102', '106', '113', '151', '162', '171', '185', '194', '200', '231', '232', '233', '237', '238']
local_peak = ['007', '021', '024', '025', '030', '049', '058', '062', '064', '085', '089', '097', '115', '129', '132', '133', '138', '157', '166', '170', '172', '193', '197', '234', '235', '243']
missing_drop = ['002', '072', '110', '180']
missing_peak = ['004', '019', '035', '036', '059', '060', '094', '112', '127', '143', '144', '167', '168', '248']
noise = ['003', '008', '027', '028', '029', '039', '056', '067', '068', '083', '095', '098', '107', '111', '116', '135', '136', '137', '147', '164', '175', '176', '191']
outlier_datasets = ['011', '012', '015', '016', '017', '018', '070', '071', '096', '119', '120', '123', '124', '125', '126', '178', '179', '192', '213', '216', '220', '226']
reverse = ['020', '022', '038', '052', '055', '065', '090', '103', '128', '130', '146', '160', '163', '173', '198', '201', '203', '209', '212', '225', '230', '242', '249']
sampling_rate = ['050', '061', '105', '158', '169']
signal_shift = ['204']
smoothed_increase = []
steep_increase = ['051', '159']
time_shift = ['069', '074', '075', '079', '080', '081', '082', '087', '088', '108', '177', '182', '183', '187', '188', '189', '190', '195', '196', '206', '207', '208']
time_warping = ['031', '076', '139', '184']
unusual_pattern = ['001', '006', '009', '010', '041', '046', '047', '073', '093', '109', '114', '117', '118', '149', '154', '155', '181', '210', '211', '214', '218', '219', '221', '250']
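# Note (added): the ID lists above let the evaluation loop further down be
# restricted to a single anomaly category via the membership test on
# name_split[0].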
def highest_autocorrelation(ts, min_size=10, max_size=1000):
    """Estimate the dominant period of ts as the ACF peak with the highest
    autocorrelation at a lag between min_size and max_size."""
    acf_values = acf(ts, fft=True, nlags=int(ts.shape[0] / 2))
    peaks, _ = find_peaks(acf_values)
    peaks = peaks[np.logical_and(peaks >= min_size, peaks < max_size)]
    corrs = acf_values[peaks]
    if len(peaks) == 0:
        # No peak in the requested range: retry with a wider upper bound.
        peaks, _ = find_peaks(acf_values)
        peaks = peaks[np.logical_and(peaks >= min_size, peaks < 2000)]
        corrs = acf_values[peaks]
        if len(peaks) == 0:
            return -1
    return peaks[np.argmax(corrs)]
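# A minimal usage sketch (illustration only, not part of the original script):
# on a noisy sine wave with a period of 50 samples, highest_autocorrelation
# should recover a lag close to 50. Note that the function is defined here but
# not called in the evaluation loop below.
#
#     demo = np.sin(np.linspace(0, 40 * np.pi, 1000)) + 0.1 * np.random.randn(1000)
#     print(highest_autocorrelation(demo))  # expected: ~50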
# Bookkeeping: count discords that land inside, before, or after the window.
correct_discord = 0
small_discord = 0
big_discord = 0
correct_discord_list = []
small_discord_list = []
big_discord_list = []
correct_discord_dist_lower = []
correct_discord_dist_higher= []
small_discord_dist = []
big_discord_dist = []
start_time = time.time()
start_process_time = time.process_time()
for dataset in lstt:
    name_split = dataset.split("_")
    # Filter by dataset ID; e.g. "if name_split[0] in flat:" runs the
    # evaluation only on time series with flat anomalies.
    if name_split[0] in ["004"]:
        print(f"Dataset #: {name_split[0]}")
        data = np.array(pd.read_csv(dataset, header=None))
        name_split[6] = name_split[6][:-4]  # strip the ".txt" extension
        # Scale the series to [0, 1] before handing it to MERLIN.
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(data).flatten()
        # The UCR file name encodes: ..._trainingEnd_anomalyBegin_anomalyEnd.txt
        train_idx = int(name_split[-3])
        begin = int(name_split[-2])
        end = int(name_split[-1])
        # Run the MATLAB implementation of MERLIN on the test portion,
        # searching discord lengths from 75 to 125.
        eng = matlab.engine.start_matlab()
        result = np.array(eng.MERLIN3_1(scaled_data[train_idx:], float(75), float(125)))
        eng.quit()
        # Pick the candidate discord supported by the most other candidates,
        # i.e. the one with the most neighbours within 100 points of it.
        count_list = []
        for k in range(len(result)):
            count = 0
            for j in range(len(result)):
                if k != j and np.abs(result[k] - result[j]) <= 100:
                    count += 1
            count_list.append(count)
        if len(count_list) != 0:
            discord = int(result[np.argmax(count_list)])
        else:
            discord = result[25]
        l = end - begin + 1  # length of the labelled anomaly
        # Map the discord back to an index in the full series; the +50 offset
        # is roughly half the searched discord length.
        discord = discord + train_idx + 50
        print(discord)
        # Score with the UCR tolerance: the prediction counts as correct if it
        # lies within max(100, anomaly length) of the labelled window.
        if min(begin - 100, begin - l) <= discord <= max(end + 100, end + l):
            print("anomaly index correct")
            correct_discord += 1
            correct_discord_list.append(name_split[0])
            correct_discord_dist_lower.append(abs(discord - min(begin - 100, begin - l)))
            correct_discord_dist_higher.append(abs(discord - max(end + 100, end + l)))
        elif discord < min(begin - 100, begin - l):
            print("anomaly index too small")
            small_discord += 1
            small_discord_list.append(name_split[0])
            small_discord_dist.append(abs(discord - min(begin - 100, begin - l)))
        elif discord > max(end + 100, end + l):
            print("anomaly index too big")
            big_discord += 1
            big_discord_list.append(name_split[0])
            big_discord_dist.append(abs(discord - max(end + 100, end + l)))
        else:
            print(f"problem with dataset {name_split[0]}")
end_time = time.time()
end_process_time = time.process_time()
runtime = end_time - start_time
print("Runtime:", runtime, "seconds")
process_time = end_process_time - start_process_time
print("Process time:", process_time, "seconds")