Commit db9789c2 authored by zelladin

deleted unnecessary files

parent c8feca40
import torch as th
import numpy as np
import params
import torch.nn.functional as F
......
import params
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import seaborn as sns
#from imblearn.over_sampling import RandomOverSampler
#from imblearn.under_sampling import RandomUnderSampler
import os
import json
import random
import itertools
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
# Make numpy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)
def splitData(dataset):
"""Split data into test and train data"""
# split labeled data in test and train data
X_train, X_test = train_test_split(dataset, test_size=0.952)
#X_train, X_test = train_test_split(dataset, test_size=0.868)
#X_train, X_test = train_test_split(dataset, test_size=0.903)
#X_train.to_csv(os.path.join(params.save_directory, params.path_train_labeled_Data), index=False)
#X_test.to_csv(os.path.join(params.save_directory, params.path_test_Data), index=False)
#X_train.to_csv("dataset/Superconductivity_20_notnorm/train_norm.csv", index=True, index_label= "index")
#X_test.to_csv("dataset/Superconductivity_20_notnorm/test_norm.csv", index=True, index_label= "index")
#X_train.to_csv("dataset/UTKFACEBOTH/train2.csv", index=True)
#X_test.to_csv("dataset/UTKFACEBOTH/test2.csv", index=True)
return X_train, X_test
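# Note: splitData is not called in the pipeline below; main() reads pre-generated
# labeled/unlabeled splits from CSV (see the commented-out call in main()).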
def pop_random(lst):
"""get random item from list"""
idx = random.randrange(0, len(lst))
return lst.pop(idx)
def createLabeledPairs(X_train):
"""Create Pairs between the labeled and unlabeled samples"""
pairs_label = []
lst = X_train.index.values.tolist()
#name_of_goal_variable = uncertainty
###each sample in one pair only
#while len(lst) > 1:
# rand1 = pop_random(lst)
# rand2 = pop_random(lst)
# distance = np.abs(X_train.loc[rand1].critical_temp - X_train.loc[rand2].critical_temp)
# pair_1 = (rand1, rand2, distance)
# pair_2 = (rand2, rand1, -distance)
# pairs_label.append(pair_1)
# pairs_label.append(pair_2)
# each sample paired with every other sample
for rand1, rand2 in itertools.combinations_with_replacement(lst, 2):
if rand1 == rand2:
continue
distance = np.abs(X_train.loc[rand1].critical_temp - X_train.loc[rand2].critical_temp)
pair_1 = (rand1, rand2, distance)
pairs_label.append(pair_1)
##take only the 5% with lowest difference as similar, 95% dissimilar
threshold = np.quantile(np.asarray(pairs_label)[:, 2], 0.05)
pairs_similar_index = np.where(np.asarray(pairs_label)[:, 2] <= threshold) ##similar
pairs_dissimilar_index = np.where(np.asarray(pairs_label)[:, 2] > threshold) ##dissimilar
#
pairs_similar = np.take(pairs_label, pairs_similar_index, axis=0)[0, :, :2]
pairs_dissimilar = np.take(pairs_label, pairs_dissimilar_index, axis=0)[0, :, :2]
#
# ##ones for dissimilar, 0 similar based on contrastive loss definition
pairs_dissimilar = np.append(pairs_dissimilar, np.ones((len(pairs_dissimilar), 1)), axis=1)
pairs_similar = np.append(pairs_similar, np.zeros((len(pairs_similar), 1)), axis=1)
#
pairs_labeled_both = np.concatenate((pairs_dissimilar, pairs_similar))
return pairs_labeled_both, pairs_similar, pairs_dissimilar
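# Usage sketch (as in main() below): given a labeled DataFrame indexed by sample id
# with a critical_temp column,
#   pairs_both, pairs_sim, pairs_dis = createLabeledPairs(data_labeled)
# returns arrays of rows [index_1, index_2, label] with label 0 = similar, 1 = dissimilar.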
def createUnlabeledPairs(X_train, X_train_u):
"""Create Pairs between the labeled and unlabeled samples"""
lst_l = X_train.index.values.tolist()
lst_u = X_train_u.index.values.tolist()
pairs_unlabel = []
while len(lst_l) > 1:
rand1 = pop_random(lst_l)
rand2 = pop_random(lst_u)
distance = np.linalg.norm(X_train.loc[rand1] - X_train_u.loc[rand2])
pair = rand1, rand2, distance
pairs_unlabel.append(pair)
lst_u = X_train_u.index.values.tolist()
#while len(lst_u) > 1:
# rand1 = pop_random(lst_u)
# rand2 = pop_random(lst_u)
# distance = np.linalg.norm(X_train_u.loc[rand1] - X_train_u.loc[rand2])
# pair = rand1, rand2, distance
# pairs_unlabel.append(pair)
for rand1, rand2 in itertools.combinations_with_replacement(lst_u, 2):
if rand1 == rand2:
continue  #skip self-pairs (distance 0) so they do not skew the similarity threshold
distance = np.linalg.norm(X_train_u.loc[rand1] - X_train_u.loc[rand2])
pair = rand1, rand2, distance
pairs_unlabel.append(pair)
##take only the 5% with lowest difference as similar, 95% dissimilar
threshold = np.quantile(np.asarray(pairs_unlabel)[:, 2], 0.05)
#threshold = 0
Pairs_unlabeled_similar_index = np.where(np.asarray(pairs_unlabel)[:, 2] <= threshold) ##similar
Pairs_unlabeled_dissimilar_index = np.where(np.asarray(pairs_unlabel)[:, 2] > threshold) ##dissimilar
pairs_unlabeled_similar = np.take(pairs_unlabel, Pairs_unlabeled_similar_index, axis=0)[0, :, :2]
pairs_unlabeled_dissimilar = np.take(pairs_unlabel, Pairs_unlabeled_dissimilar_index, axis=0)[0, :, :2]
pairs_unlabeled_dissimilar = np.append(pairs_unlabeled_dissimilar, np.ones((len(pairs_unlabeled_dissimilar), 1)),
axis=1)
pairs_unlabeled_similar = np.append(pairs_unlabeled_similar, np.zeros((len(pairs_unlabeled_similar), 1)), axis=1)
pairs_unlabeled_both = np.concatenate((pairs_unlabeled_dissimilar, pairs_unlabeled_similar))
##ones for dissimilar, 0 similar based on contrastive loss definition
return pairs_unlabeled_both, pairs_unlabeled_similar, pairs_unlabeled_dissimilar
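# Note: for the unlabeled pairs the 5% similarity threshold is applied to the Euclidean
# distance between feature vectors (no target value is available), whereas the labeled
# pairs use the absolute critical_temp difference.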
def Oversampling(pairs_labeled, pairs_unlabeled):
"""Random Oversampling
duplicate random samples"""
number_U = len(pairs_unlabeled)
number_L = len(pairs_labeled)
pairs_labeled_oversampling = pairs_labeled.copy().tolist()
while number_U > len(pairs_labeled_oversampling):
idx = random.randrange(0, number_L)
pairs_labeled_oversampling.append(pairs_labeled[idx].tolist())
return np.asarray(pairs_labeled_oversampling)
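# Used in main() below to duplicate random labeled pairs until there are as many
# labeled pairs as unlabeled pairs.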
def UnderSampling(pairs_similar, pairs_dissimilar):
"""Random Undersampling
remove random samples"""
number_S = len(pairs_similar)
df = pd.DataFrame(data=pairs_dissimilar, columns=["column1", "column2", "column3"])
pairs_diss_undersampled = df.sample(number_S)
return pairs_diss_undersampled.to_numpy()
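# Draws as many dissimilar pairs as there are similar pairs (sampling without
# replacement), so similar and dissimilar pairs end up balanced.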
def main():
for i in range(5):
print("start " + str(i))
base_path = "dataset_double_random_different_num_labeled_multiple_oldapproach/50"
path_labeled = "train_labeled_norm.csv"
path_unlabeled = "train_unlabeled_norm.csv"
##read the pre-normalized labeled and unlabeled data and save the combined table for later use
path_data_labeled = os.path.join(base_path,str(i),path_labeled)
path_data_unlabeled = os.path.join(base_path,str(i),path_unlabeled)
data_labeled = pd.read_csv(path_data_labeled,
na_values='?', comment='\t',
sep=',', header=0)
data_unlabeled = pd.read_csv(path_data_unlabeled,
na_values='?', comment='\t',
sep=',', header=0)
data_labeled.set_index("index", inplace=True)
data_unlabeled.set_index("index", inplace=True)
bigdata = pd.concat([data_labeled, data_unlabeled], ignore_index=False, sort=True)
bigdata.to_csv(os.path.join(base_path, str(i), "samples_both_small.csv"),
index=True,
index_label="index")
#train_labeled, test_labeled = splitData(data_labeled)
##drop the column with the target variable (e.g. critical_temp or uncertainty) from the unlabeled data
data_unlabeled = data_unlabeled.drop(columns=["critical_temp"])
##create pairs labeled
pairs_labeled, pairs_labeled_similar, pairs_labeled_dissimilar = createLabeledPairs(data_labeled)
##drop it from the labeled data as well before building pairs with the unlabeled samples
data_l = data_labeled.drop(columns=["critical_temp"])
pairs_unlabeled, pairs_unlabeled_similar, pairs_unlabeled_dissimilar = createUnlabeledPairs(data_l, data_unlabeled)
###oversample labeled pairs until |labeled| == |unlabeled|
P_L = Oversampling(pairs_labeled, pairs_unlabeled)
##split labeled pairs in dissimilar and similar pairs
S_L_index = np.where(np.asarray(P_L)[:, 2] == 0) ## get indices for similar
U_L_index = np.where(np.asarray(P_L)[:, 2] > 0) ## get indices for dissimilar
S_L = np.take(P_L, S_L_index, axis=0)[0, :] # similar labeled pairs
D_L = np.take(P_L, U_L_index, axis=0)[0, :] # dissimilar labeled pairs
D_LU = np.concatenate((pairs_unlabeled_dissimilar, D_L))
S_LU = np.concatenate((pairs_unlabeled_similar, S_L))
##undersampling until |similar| == |dissimilar|
D_LU = UnderSampling(S_LU, D_LU)
##concatenate similar and dissimilar pairs into the final pair set
P = np.concatenate((S_LU, D_LU))
##save final pairs
path_pairs = "pairs_combined_small.csv"
#np.savetxt(os.path.join(base_path,str(i),path_pairs), P, delimiter=",")  #superseded by the to_csv call below, which writes the same pairs with headers
df_eval = pd.DataFrame.from_records(
P, columns=['Sample1', "Sample2", 'Difference'])  ##third column holds the contrastive label: 0 = similar, 1 = dissimilar
df_eval.to_csv(os.path.join(base_path,str(i),path_pairs),index=False, index_label="index")
if __name__ == "__main__":
main()
print("finished")
import params
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
# from imblearn.over_sampling import RandomOverSampler
# from imblearn.under_sampling import RandomUnderSampler
import os
import json
import random
import itertools
import sklearn
import cv2 as cv
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
# Make numpy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)
def pop_random(lst):
"""get random item from list"""
idx = random.randrange(0, len(lst))
return lst.pop(idx)
def createLabeledPairs(X_train):
"""Create Pairs between the labeled and unlabeled samples"""
pairs_label = []
lst = X_train.index.values.tolist()
# name_of_goal_variable = uncertainty
###each sample in one pair only
# while len(lst) > 0:
# rand1 = pop_random(lst)
# rand2 = pop_random(lst)
# distance = X_train.loc[rand1].critical_temp - X_train.loc[rand2].critical_temp
# pair_1 = (rand1, rand2, distance)
# pair_2 = (rand2, rand1, -distance)
# pairs_label.append(pair_1)
# pairs_label.append(pair_2)
# each sample paired with every other sample
for rand1, rand2 in itertools.combinations_with_replacement(lst, 2):
if rand1 == rand2:
continue
distance = X_train.loc[rand1].crowd_counting - X_train.loc[rand2].crowd_counting
pair_1 = (rand1, rand2, distance)
pair_2 = (rand2, rand1, -distance)
pairs_label.append(pair_1)
pairs_label.append(pair_2)
##take only the 5% with lowest difference as similar, 95% dissimilar
# threshold = np.quantile(np.asarray(pairs_label)[:, 2], 0.05)
# pairs_similar_index = np.where(np.asarray(pairs_label)[:, 2] <= threshold) ##similar
# pairs_dissimilar_index = np.where(np.asarray(pairs_label)[:, 2] > threshold) ##dissimilar
#
# pairs_similar = np.take(pairs_label, pairs_similar_index, axis=0)[0, :, :2]
# pairs_dissimilar = np.take(pairs_label, pairs_dissimilar_index, axis=0)[0, :, :2]
#
# ##ones for dissimilar, 0 similar based on contrastive loss definition
# pairs_dissimilar = np.append(pairs_dissimilar, np.ones((len(pairs_dissimilar), 0)), axis=0)
# pairs_similar = np.append(pairs_similar, np.zeros((len(pairs_similar), 0)), axis=0)
#
# pairs_labeled_both = np.concatenate((pairs_dissimilar, pairs_similar))
# return pairs_labeled_both, pairs_similar, pairs_dissimilar
return pairs_label
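# Note: this variant keeps the signed crowd_counting difference for both orderings of
# each pair rather than converting it into a 0/1 contrastive label.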
def loadimage(X_train, rand1):
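"""Load the image referenced by X_train.loc[rand1].path from the ShanghaiTech part_B training images and min-max normalize it to [0, 1]."""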
base_path = "dataset/ShanghaiTech/part_B/train_data/images"
path = X_train.loc[rand1].path
img = cv.imread(os.path.join(base_path, path))
norm_img = np.zeros_like(img)
final_img = cv.normalize(img, norm_img, 0, 255, cv.NORM_MINMAX) / 255
return final_img
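# cv.NORM_MINMAX stretches each image to the full 0-255 range before the division by 255,
# so per-image brightness/contrast differences are largely removed before the pixel-wise
# distances between images are computed in createUnlabeledPairs below.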
def createUnlabeledPairs(X_train, X_train_u):
"""Create Pairs between the labeled and unlabeled samples"""
lst_l = X_train.index.values.tolist()
lst_u = X_train_u.index.values.tolist()
pairs_unlabel = []
while len(lst_l) > 1:
rand1 = pop_random(lst_l)
rand2 = pop_random(lst_u)
final_img_rand1 = loadimage(X_train, rand1)
final_img_rand2 = loadimage(X_train_u, rand2)
##load image and then compute difference
distance = np.linalg.norm(final_img_rand1 - final_img_rand2)
pair = rand1, rand2, distance
pairs_unlabel.append(pair)
lst_u = X_train_u.index.values.tolist()
# while len(lst_u) > 1:
# rand1 = pop_random(lst_u)
# rand2 = pop_random(lst_u)
# distance = np.linalg.norm(X_train_u.loc[rand1] - X_train_u.loc[rand2])
# pair = rand1, rand2, distance
# pairs_unlabel.append(pair)
for rand1, rand2 in itertools.combinations_with_replacement(lst_u, 2):
if rand1 == rand2:
continue
final_img_rand1 = loadimage(X_train_u, rand1)
final_img_rand2 = loadimage(X_train_u, rand2)
distance = np.linalg.norm(final_img_rand1 - final_img_rand2)
pair = rand1, rand2, distance
pairs_unlabel.append(pair)
##take only the 5% with lowest difference as similar, 95% dissimilar
threshold = np.quantile(np.asarray(pairs_unlabel)[:, 2], 0.05)
# threshold = 0
Pairs_unlabeled_similar_index = np.where(np.asarray(pairs_unlabel)[:, 2] <= threshold) ##similar
Pairs_unlabeled_dissimilar_index = np.where(np.asarray(pairs_unlabel)[:, 2] > threshold) ##dissimilar
pairs_unlabeled_similar = np.take(pairs_unlabel, Pairs_unlabeled_similar_index, axis=0)[0, :, :2]
pairs_unlabeled_dissimilar = np.take(pairs_unlabel, Pairs_unlabeled_dissimilar_index, axis=0)[0, :, :2]
pairs_unlabeled_dissimilar = np.append(pairs_unlabeled_dissimilar, np.ones((len(pairs_unlabeled_dissimilar), 1)),
axis=1)
pairs_unlabeled_similar = np.append(pairs_unlabeled_similar, np.zeros((len(pairs_unlabeled_similar), 1)), axis=1)
pairs_unlabeled_both = np.concatenate((pairs_unlabeled_dissimilar, pairs_unlabeled_similar))
##ones for dissimilar, 0 similar based on contrastive loss definition
return pairs_unlabeled_both, pairs_unlabeled_similar, pairs_unlabeled_dissimilar
def Oversampling(pairs_labeled, pairs_unlabeled):
"""Random Oversampling
duplicate random samples"""
number_U = len(pairs_unlabeled)
number_L = len(pairs_labeled)
pairs_labeled_oversampling = pairs_labeled.copy().tolist()
while number_U > len(pairs_labeled_oversampling):
idx = random.randrange(0, number_L)
pairs_labeled_oversampling.append(pairs_labeled[idx].tolist())
return np.asarray(pairs_labeled_oversampling)
def UnderSampling(pairs_similar, pairs_dissimilar):
"""Random Undersampling
remove random samples"""
number_S = len(pairs_similar)
df = pd.DataFrame(data=pairs_dissimilar, columns=["column1", "column2", "column3"])
pairs_diss_undersampled = df.sample(number_S)
return pairs_diss_undersampled.to_numpy()
def main_2():
##read the labeled data; the train/unlabeled/evaluation splits are created and saved below
path_data_labeled = "dataset/ShanghaiTech/part_B/train_data/train.csv"
data_labeled = pd.read_csv(path_data_labeled,
na_values=-200, comment='\t',
sep=',', header=0,na_filter=True)
data_labeled.set_index("index", inplace= True)
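##for each labeled-set size i and repetition j: draw i labeled training samples,
##100 evaluation samples, and use all remaining samples as unlabeled training data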
for i in [100]:
for j in range(5):
start_path = "dataset_double_random_ShanghaiTech/"
directory_name = "{}/{}".format(i,j)
if not os.path.exists(os.path.join(start_path, directory_name)):
os.makedirs(os.path.join(start_path, directory_name))
base_path = os.path.join(start_path, directory_name)
train_labeled = data_labeled.sample(i)
rest = data_labeled.drop(train_labeled.index)
evaluation = rest.sample(100)
rest_labeled = rest.drop(evaluation.index)
train_unlabeled = rest_labeled
#test_labeled = rest_labeled.drop(train_unlabeled.index)
path_train_labeled = "train_labeled_norm.csv"
path_test_labeled = "test_norm.csv"
path_train_unlabeled = "train_unlabeled_norm.csv"
path_evaluation = "evaluation.csv"
train_labeled.to_csv(os.path.join(base_path, path_train_labeled), index=True, index_label="index")
#test_labeled.to_csv(os.path.join(base_path, path_test_labeled), index=True, index_label="index")
train_unlabeled.to_csv(os.path.join(base_path, path_train_unlabeled), index=True, index_label="index")
evaluation.to_csv(os.path.join(base_path, path_evaluation), index=True, index_label="index")
pairs_labeled = np.asarray(createLabeledPairs(train_labeled))
pairs_evaluation = np.asarray(createLabeledPairs(evaluation))
path_pairs_labeled = "pairs_norm.csv"
path_evaluation_pairs = "pairs_eval_norm.csv"
df = pd.DataFrame.from_records(
pairs_labeled, columns=['Sample1', "Sample2", 'Difference'])
df.to_csv(os.path.join(base_path, path_pairs_labeled), index=False, index_label="index")
df_eval = pd.DataFrame.from_records(
pairs_evaluation, columns=['Sample1', "Sample2", 'Difference'])
df_eval.to_csv(os.path.join(base_path, path_evaluation_pairs), index=False, index_label="index")
if __name__ == "__main__":
main_2()
print("finished")
import params
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import os
import json
import random
import itertools
import tensorflow as tf
import geopandas as gpd
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
# Make numpy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)
print(tf.__version__)
def splitData(dataset):
"""Split data into test and train data"""
# split labeled data in test and train data
X_train, X_test = train_test_split(dataset, test_size=0.952)
# X_train, X_test = train_test_split(dataset, test_size=0.868)
# X_train, X_test = train_test_split(dataset, test_size=0.903)
# X_train.to_csv(os.path.join(params.save_directory, params.path_train_labeled_Data), index=False)
# X_test.to_csv(os.path.join(params.save_directory, params.path_test_Data), index=False)
# X_train.to_csv("dataset/Superconductivity_20_notnorm/train_norm.csv", index=True, index_label= "index")
# X_test.to_csv("dataset/Superconductivity_20_notnorm/test_norm.csv", index=True, index_label= "index")
# X_train.to_csv("dataset/UTKFACEBOTH/train2.csv", index=True)
# X_test.to_csv("dataset/UTKFACEBOTH/test2.csv", index=True)
return X_train, X_test
def pop_random(lst):
"""get random item from list"""
idx = random.randrange(0, len(lst))
return lst.pop(idx)
def createLabeledPairs(X_train):
"""Create Pairs between the labeled and unlabeled samples"""
pairs_label = []
lst = X_train.index.values.tolist()
# name_of_goal_variable = uncertainty
###each sample in one pair only
# while len(lst) > 1:
# rand1 = pop_random(lst)
# rand2 = pop_random(lst)
# distance = np.abs(X_train.loc[rand1].critical_temp - X_train.loc[rand2].critical_temp)
# pair_1 = (rand1, rand2, distance)
# #pair_2 = (rand2, rand1, -distance)
# pairs_label.append(pair_1)
# #pairs_label.append(pair_2)
# each sample paired with every other sample
for rand1, rand2 in itertools.combinations_with_replacement(lst, 2):
if rand1 == rand2:
continue
distance = np.abs(X_train.loc[rand1].critical_temp - X_train.loc[rand2].critical_temp)
pair_1 = (rand1,rand2, distance)
pairs_label.append(pair_1)
##take only the 5% with lowest difference as similar, 95% dissimilar
threshold = np.quantile(np.asarray(pairs_label)[:, 2], 0.05)
pairs_similar_index = np.where(np.asarray(pairs_label)[:, 2] <= threshold) ##similar
pairs_dissimilar_index = np.where(np.asarray(pairs_label)[:, 2] > threshold) ##dissimilar
#
pairs_similar = np.take(pairs_label, pairs_similar_index, axis=0)[0, :, :2]
pairs_dissimilar = np.take(pairs_label, pairs_dissimilar_index, axis=0)[0, :, :2]
#
# ##ones for dissimilar, 0 similar based on contrastive loss definition
pairs_dissimilar = np.append(pairs_dissimilar, np.ones((len(pairs_dissimilar), 1)), axis=1)
pairs_similar = np.append(pairs_similar, np.zeros((len(pairs_similar), 1)), axis=1)
#
pairs_labeled_both = np.concatenate((pairs_dissimilar, pairs_similar))
return pairs_labeled_both, pairs_similar, pairs_dissimilar
# return pairs_label
def createUnlabeledPairs(X_train, X_train_u):
"""Create Pairs between the labeled and unlabeled samples"""
lst_l = X_train.index.values.tolist()
lst_u = X_train_u.index.values.tolist()
pairs_unlabel = []
while len(lst_l) > 1:
rand1 = pop_random(lst_l)
rand2 = pop_random(lst_u)