Commit 30fd03aa authored by Kristiyan Blagov

removed STAN.py

parent 1fc3ec66
# -*- coding: utf-8 -*-
"""
Created on Sat Jan 11 22:22:47 2025
@author: Kristiyan
"""
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import statsmodels.api as sm
from statsmodels.tsa.stattools import acf
from scipy.signal import find_peaks
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler
import math
import os
import time
from collections import defaultdict
# The IDs of the UCR Anomaly Archive time series, grouped into lists by anomaly type
amplitude_change = ["013","014", "037", "042", "044", "053", "057", "066", "091", "100", "104", "121", "122", "145", "150", "152", "161", "165", "174", "199", "205", '206', '207', "215", "217"]
flat = ['045', '078', '110', '153', '186', '236']
freq_change = ['023', '026', '032', '033', '034', '040', '048', '099', '101', '131', '134', '140', '141', '142', '148', '156', '202', '222', '223', '224', '227', '228', '229', '244', '245', "246", '247']
local_drop = ['005', '043', '054', '063', '077', '086', '092', '102', '106', '113', '151', '162', '171', '185', '194', '200', '232', '233', '237', '238']
local_peak = ['007', '021', '024', '025', '030', '049', '062', '064', '085', '089', '097', '115', '129', '132', '133', '138', '157', '170', '172', '193', '197', '234', '235', '239', '243']
missing_drop = ['002', '072', '180']
missing_peak = ['004', '019', '035', '036', '059', '060', '094', '112', '127', '143', '144', '167', '168', '248']
noise = ['003', '008', '027', '028', '029', '039', '056', '067', '068', '083', '095', '098', '107', '111', '116', '135', '136', '137', '147', '164', '175', '176', '191']
outlier_datasets = ['011', '012', '015', '018', '070', '071', '084', '119', '120', '123', '126', '178', '179', '192', '213', '216', '220', '226']
reverse = ['020', '022', '038', '052', '055', '065', '090', '103', '128', '130', '146', '160', '163', '173', '198', '201', '203', '209', '212', '225', '230', '242', '249']
reverse_horizontal = ['016', '017', '058', '096', '124', '125', '166', '231']
sampling_rate = ['050', '061', '105', '158', '169']
signal_shift = ['204']
smoothed_increase = ['241']
steep_increase = ['051', '159']
time_shift = ['069', '074', '075', '079', '080', '081', '082', '087', '088', '108', '177', '182', '183', '187', '188', '189', '190', '195', '196', '208']
time_warping = ['031', '076', '139', '184']
unusual_pattern = ['001', '006', '009', '010', '041', '046', '047', '073', '093', '109', '114', '117', '118', '149', '154', '155', '181', '210', '211', '214', '218', '219', '221', '240', '250']
def count_direction_changes(subseq):
    count = 0
    direction = "unchanged"
    for i in range(1, len(subseq)):
        if subseq[i] - subseq[i-1] > 0:
            new_direction = "up"
        elif subseq[i] - subseq[i-1] < 0:
            new_direction = "down"
        else:
            new_direction = "unchanged"
        if new_direction != direction:
            count += 1
            direction = new_direction
    return count
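# Worked example: count_direction_changes([1, 2, 1, 1]) == 3
# (start -> "up", then "up" -> "down", then "down" -> "unchanged").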
def point_anomaly(subseq):
    return np.std(subseq)
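# Estimate the dominant period of a series as the lag of the strongest ACF
# peak in [min_size, max_size); if no peak is found there, the search widens
# to lag 2000, and -1 is returned when no peak exists at all.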
def highest_autocorrelation(ts, min_size=10, max_size=1000):
    acf_values = acf(ts, fft=True, nlags=int(ts.shape[0]/2))
    peaks, _ = find_peaks(acf_values)
    peaks = peaks[np.logical_and(peaks >= min_size, peaks < max_size)]
    corrs = acf_values[peaks]
    if len(peaks) == 0:
        peaks, _ = find_peaks(acf_values)
        peaks = peaks[np.logical_and(peaks >= min_size, peaks < 2000)]
        corrs = acf_values[peaks]
        if len(peaks) == 0:
            return -1
    return peaks[np.argmax(corrs)]
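# The script expects a folder named "UCR_Anomaly_Archive" with the archive's
# .txt files directly inside the current working directory.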
os.chdir(os.path.join(os.getcwd(), "UCR_Anomaly_Archive"))
files = os.listdir()
time_series_list = []
for file in files:
    if file.endswith(".txt"):
        time_series_list.append(file)
ult_outlier = []
ult_correct = 0
wrong_datasets = []
wrong_factors = []
wrong_outlier_idx = []
bigger_dist = []
smaller_dist = []
to_upper_bound = []
to_lower_bound = []
turnpts = []
correct_ts = []
incorrect_ts = []
sum_stat_results = defaultdict(int)
summary_statistics = [np.min, np.max, np.mean, np.std, st.skew, st.kurtosis, count_direction_changes, point_anomaly]
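# Eight summary statistics are applied per window: min/max/mean/std/skew/
# kurtosis run on the differenced series, while count_direction_changes uses
# the raw series and point_anomaly uses a fixed window of 3 (see the loop).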
start_time = time.time()
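# UCR file names follow the pattern
# <ID>_UCR_Anomaly_<name>_<train_end>_<anomaly_begin>_<anomaly_end>.txt,
# so the last three underscore-separated fields carry the train/test split
# index and the ground-truth anomaly location.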
for dataset in time_series_list:
    name_split = dataset.split("_")
    # To iterate over time series with a specific anomaly type only, replace
    # the condition below with e.g. "if name_split[0] in amplitude_change:",
    # "if name_split[0] in flat:", ...
    if name_split[0]:
        data = np.array(pd.read_csv(dataset, header=None)).flatten()
        # Strip the ".txt" extension from the last filename token
        name_split[6] = name_split[6][:-4]
        y_line = np.mean([int(name_split[-1]), int(name_split[-2])])
        period_length = highest_autocorrelation(data)
        # Begin index, end index and length of the ground-truth anomaly
        begin = int(name_split[-2])
        end = int(name_split[-1])
        l = end - begin + 1
        data_original = data
        data = np.diff(data)
        factor = []
        outlier_idx = []
        #powers_of_2 = [period_length]
        for f in summary_statistics:
            # count_direction_changes runs on the raw (undifferenced) series
            if f.__name__ == "count_direction_changes":
                data = data_original
            # point_anomaly uses a fixed window of 3 instead of the period
            if f.__name__ == "point_anomaly":
                period_length = 3
            agg_data = []
            for j in range(len(data)):
                if j % period_length == 0 and (j + period_length) <= len(data):
                    agg_data.append(f(data[j:j+period_length]))
            # Scale the aggregated data to the range [0, 1]
            data_to_scale = [[x] for x in agg_data]
            scaler = MinMaxScaler()
            scaled_data = scaler.fit_transform(data_to_scale)
            scaled_data = scaled_data.flatten()
            # Index separating the train and test parts of the aggregated series
            train_test_idx = math.floor(int(name_split[-3]) / period_length)
            # Set the index of either the maximal or the minimal value from
            # the test part of the transformed data as the outlier index
            if np.max(scaled_data[train_test_idx:]) - np.mean(scaled_data) >= np.mean(scaled_data) - np.min(scaled_data[train_test_idx:]):
                outlier = np.argmax(scaled_data[train_test_idx:]) + train_test_idx
            else:
                outlier = np.argmin(scaled_data[train_test_idx:]) + train_test_idx
            # Record the anomaly detection accuracy for each summary statistic
            if outlier*period_length + period_length/2 > min(begin-l, begin-100) and outlier*period_length + period_length/2 < max(end+l, end+100):
                sum_stat_results[f.__name__] += 1
            # Maximal and minimal values in the train set, used for comparison below
            sort_scaled = sorted(scaled_data[:train_test_idx])
            scaled_lower_bound = sort_scaled[0]
            scaled_upper_bound = sort_scaled[-1]
            # If the detected outlier index falls within the train dataset,
            # it is disregarded
            if outlier <= train_test_idx:
                factor.append(0)
                outlier_idx.append(0)
            # Save the index of the outlier plus its difference with the
            # maximum of the train dataset
            elif scaled_data[outlier] > scaled_upper_bound:
                factor.append(scaled_data[outlier] - scaled_upper_bound)
                outlier_idx.append(outlier*period_length + period_length/2)
            # Save the index of the outlier plus its difference with the
            # minimum of the train dataset
            elif scaled_data[outlier] < scaled_lower_bound:
                factor.append(scaled_lower_bound - scaled_data[outlier])
                outlier_idx.append(outlier*period_length + period_length/2)
            # The detected outlier is in the test dataset, but is smaller
            # than the max and bigger than the min of the train dataset
            else:
                factor.append(0)
                outlier_idx.append(outlier*period_length + period_length/2)
            # Restore the differenced series and the period for the next statistic
            if f.__name__ == "count_direction_changes":
                data = np.diff(data)
            if f.__name__ == "point_anomaly":
                period_length = highest_autocorrelation(data)
        # Pick the anomaly index with the largest factor
        max_fact = np.argmax(factor)
        ult_outlier.append(outlier_idx[max_fact])
        # Check if the anomaly index is within the range of the ground truth
        if outlier_idx[max_fact] >= min(begin-l, begin-100) and outlier_idx[max_fact] <= max(end+l, end+100):
            ult_correct += 1
        # The commented lines below were used for experimenting
        #to_lower_bound.append(outlier_idx[max_fact] - min(begin-l,begin-100))
        #to_upper_bound.append(max(end+l,end+100) - outlier_idx[max_fact])
        #correct_ts.append(name_split[0])
        #elif outlier_idx[max_fact] < min(begin-l,begin-100):
        #    smaller_dist.append(min(begin-l,begin-100) - outlier_idx[max_fact])
        #    incorrect_ts.append(name_split[0])
        #elif outlier_idx[max_fact] > max(end+l,end+100):
        #    bigger_dist.append(outlier_idx[max_fact] - max(end+l,end+100))
        #    incorrect_ts.append(name_split[0])
        #else:
        #    wrong_factors.append(factor)
        #    wrong_outlier_idx.append(outlier_idx)
        #    wrong_datasets.append(name_split[0])
        #    incorrect_ts.append(name_split[0])
end_time = time.time()
runtime = end_time - start_time
print("Total Runtime:", runtime, "seconds")
print("Runtime per Dataset:", runtime/250, "seconds")
print("Correctly detected anomalies:", ult_correct, "out of 250")
#print(correct_ts)
#print(incorrect_ts)
print("Correctly detected anomalies per summary statistic:")
print(sum_stat_results)
#print(factor)
#print(outlier_idx)
#print(ult_outlier)