Commit cc8b7944 authored by zelladin

add all code

parent f138c7ae
/dataset/UTKFACEBOTH/UTKFace/
__pycache__/
Save/
Saves/
.idea/
venv/
triplets/*
dataset*/
triplets
*.png
# Author: https://github.com/AntixK
# Modified by: Adina
dataset_params:
  dataset_path: "dataset_double_random_ShanghaiTech"
  number: "0"
  img_size: 81
  batch_size: 64

model_params:
  name: 'SiameseNew'
  number_of_neurons: 100
  in_channels: 81
  latent_dim: 128
  loss_type: 'B'
  gamma: 10.0
  max_capacity: 25
  Capacity_max_iter: 10000

exp_params:
  img_size: 81
  batch_size: 64
  epochs: 0
  LR: 0.001
  weight_decay: 0.0
  scheduler_gamma: 0.95

trainer_params:
  gpus: 0
  max_nb_epochs: 300
  max_epochs: 300

logging_params:
  save_dir: "Saves/"
  approach: "Shanghai"
  name: "test_again"
  manual_seed: 1265
  date: ""
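
# Loading sketch (an assumption, not part of this repo's code): configs like this are
# typically read with PyYAML and the sub-dictionaries handed to the model and trainer, e.g.
#   import yaml
#   with open("config.yaml") as f:
#       config = yaml.safe_load(f)
#   model_args = config["model_params"]      # {'name': 'SiameseNew', 'latent_dim': 128, ...}
#   trainer_args = config["trainer_params"]  # {'gpus': 0, 'max_epochs': 300, ...}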
# Author: https://github.com/AntixK
# Modified by: Adina
dataset_params:
  dataset_path: "dataset/Superconductivity_representive_double"
  img_size: 81
  batch_size: 64

model_params:
  name: 'SiameseNew'
  number_of_neurons: 100
  in_channels: 81
  latent_dim: 128
  loss_type: 'B'
  gamma: 10.0
  max_capacity: 25
  Capacity_max_iter: 10000

exp_params:
  img_size: 81
  batch_size: 64
  epochs: 30
  LR: 0.001
  weight_decay: 0.0
  scheduler_gamma: 0.95

trainer_params:
  gpus: 0
  max_nb_epochs: 300
  max_epochs: 300

logging_params:
  save_dir: "Saves/"
  approach: "Approach3_representive_set_exp"
  name: "randomtest"
  manual_seed: 1265
  date: ""
import torch as th
import numpy as np
import params
import torch.nn.functional as F
class ContrastiveLoss(th.nn.Module):
    """Contrastive loss function (label 0 = similar pair, label 1 = dissimilar pair)"""

    def __init__(self, margin=2.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = F.pairwise_distance(output1, output2)
        #loss_contrastive = th.mean((0 - label) * th.pow(euclidean_distance, 2) +
        #                           (label) * th.pow(th.clamp(self.margin - euclidean_distance, min=0.0), 2))
        loss = th.mean(1 / 2 * (1 - label) * th.pow(euclidean_distance, 2) +
                       1 / 2 * label * th.pow(th.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss
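

# Illustrative usage sketch (an addition, not called anywhere in this repo; shapes are
# assumptions): the loss takes two embedding batches and a 0/1 label per pair.
def _contrastive_loss_example():
    criterion = ContrastiveLoss(margin=2.0)
    emb_a = th.randn(16, 128)                 # embeddings of the first pair members
    emb_b = th.randn(16, 128)                 # embeddings of the second pair members
    labels = th.randint(0, 2, (16,)).float()  # 1 = dissimilar pair, 0 = similar pair
    return criterion(emb_a, emb_b, labels)
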
class DoubleMarginSiameseLoss(th.nn.Module):
    """Double-margin Siamese loss function, https://arxiv.org/pdf/1903.03238.pdf"""

    def __init__(self, margin_1=2.0, margin_2=10.0):
        super(DoubleMarginSiameseLoss, self).__init__()
        self.margin_1 = margin_1
        self.margin_2 = margin_2

    def forward(self, output1, output2, label):
        euclidean_distance = F.pairwise_distance(output1, output2)
        similar_part = F.relu(euclidean_distance - self.margin_1)
        dissimilar_part = F.relu(self.margin_2 - euclidean_distance)
        loss = th.mean(1 / 2 * (1 - label) * similar_part +
                       1 / 2 * label * dissimilar_part)
        return loss
class RankedListLoss(th.nn.Module):
    """Ranked List Loss, https://arxiv.org/pdf/1903.03238.pdf"""

    def __init__(self, margin=2.0, boundary=5, Tp=-0.5, Tn=-0.5):
        super(RankedListLoss, self).__init__()
        # Note: the margin and boundary arguments are overridden by fixed values here.
        self.margin = 35
        self.boundary = 40
        self.Tp = Tp
        self.Tn = Tn

    def forward(self, distance_positive, distance_negative, label):
        #self.boundary = 0.8*th.max(145-label, th.abs(0-label))
        #self.margin = self.boundary - 0
        distance_positive = th.abs(distance_positive)
        distance_negative = th.abs(distance_negative)
        weight_negativ = th.exp(self.Tn * (self.boundary - distance_negative))
        weight_positiv = th.exp(self.Tp * (distance_positive - (self.boundary - self.margin)))
        positive_pairs = F.relu(distance_positive - (self.boundary - self.margin))
        negative_pairs = F.relu(self.boundary - distance_negative)
        loss_positive = th.sum((weight_positiv / th.sum(weight_positiv)) * positive_pairs)
        loss_negative = th.sum((weight_negativ / th.sum(weight_negativ)) * negative_pairs)
        loss = th.mean(1 / 2 * loss_positive +
                       1 / 2 * loss_negative)
        return loss
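

# Illustrative usage sketch (an assumption mirroring the forward signature above): unlike
# the other losses, this variant expects precomputed anchor-positive and anchor-negative
# distances rather than raw embeddings; the label argument is accepted but unused.
def _ranked_list_loss_example():
    criterion = RankedListLoss()
    dist_pos = th.rand(16) * 10.0  # assumed anchor-positive distances
    dist_neg = th.rand(16) * 50.0  # assumed anchor-negative distances
    labels = th.zeros(16)
    return criterion(dist_pos, dist_neg, labels)
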
class RankedListLoss_old(th.nn.Module):
    """Ranked List Loss (older variant operating on anchor/positive/negative embeddings), https://arxiv.org/pdf/1903.03238.pdf"""

    def __init__(self, margin=2.0, boundary=5, Tp=-0.5, Tn=-0.5):
        super(RankedListLoss_old, self).__init__()
        # Note: the margin and boundary arguments are overridden by fixed values here.
        self.margin = 0.8
        self.boundary = 0.9
        self.Tp = Tp
        self.Tn = Tn

    def forward(self, anchor, positive, negative):
        distance_positive = F.pairwise_distance(anchor, positive)
        distance_negative = F.pairwise_distance(anchor, negative)
        weight_negativ = th.exp(self.Tn * (self.boundary - distance_negative))
        weight_positiv = th.exp(self.Tp * (distance_positive - (self.boundary - self.margin)))
        positive_pairs = F.relu(distance_positive - (self.boundary - self.margin))
        negative_pairs = F.relu(self.boundary - distance_negative)
        loss_positive = th.sum((weight_positiv / th.sum(weight_positiv)) * positive_pairs)
        loss_negative = th.sum((weight_negativ / th.sum(weight_negativ)) * negative_pairs)
        loss = th.mean(1 / 2 * loss_positive +
                       1 / 2 * loss_negative)
        return loss
class RevisedContrastiveLoss(th.nn.Module):
    """Revised contrastive loss: the label itself serves as the pair-specific margin"""

    def __init__(self, margin=2.0):
        super(RevisedContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output, label):
        # The label (target distance) acts as the margin for this pair.
        self.margin = label
        #euclidean_distance = F.pairwise_distance(output1, output2)
        #loss_contrastive = th.mean((0 - label) * th.pow(euclidean_distance, 2) +
        #                           (label) * th.pow(th.clamp(self.margin - euclidean_distance, min=0.0), 2))
        loss = th.mean(1 / 2 * th.pow(th.clamp(self.margin - output, min=0.0), 2))
        return loss
class MSE(th.nn.Module):
    """Mean squared error (the margin argument is unused)"""

    def __init__(self, margin=2.0):
        super(MSE, self).__init__()
        self.margin = margin

    def forward(self, input, output):
        return th.mean((input - output) ** 2)


class MAE(th.nn.Module):
    """Mean absolute error (the margin argument is unused)"""

    def __init__(self, margin=2.0):
        super(MAE, self).__init__()
        self.margin = margin

    def forward(self, input, output):
        return th.mean(th.abs(input - output))
class TripletLoss(th.nn.Module):
    """
    Triplet loss
    Takes embeddings of an anchor sample, a positive sample and a negative sample
    """

    def __init__(self, margin):
        super(TripletLoss, self).__init__()
        self.margin = margin

    #def forward(self, distance_positive, distance_negative, size_average=True):
    #    #distance_positive = distance_positive.pow(2).sum(0)  # .pow(.5)
    #    #distance_negative = distance_negative.pow(2).sum(0)  # .pow(.5)
    #
    #    losses = F.relu(distance_positive - distance_negative + self.margin)
    #    return losses.mean() if size_average else losses.sum()

    def forward(self, anchor, positive, negative, size_average=True):
        distance_positive = F.pairwise_distance(anchor, positive)
        distance_negative = F.pairwise_distance(anchor, negative)
        losses = F.relu(distance_positive - distance_negative + self.margin)
        return losses.mean() if size_average else losses.sum()
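

# Illustrative usage sketch (an addition with assumed shapes): the triplet loss pushes the
# anchor-negative distance at least `margin` beyond the anchor-positive distance.
def _triplet_loss_example():
    criterion = TripletLoss(margin=1.0)
    anchor = th.randn(16, 128)
    positive = th.randn(16, 128)
    negative = th.randn(16, 128)
    return criterion(anchor, positive, negative)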
import params
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
#import seaborn as sns
#from imblearn.over_sampling import RandomOverSampler
#from imblearn.under_sampling import RandomUnderSampler
import os
import json
import itertools
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

# Make numpy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)
def splitData(dataset):
"""Split data into test and train data"""
# split labeled data in test and train data
X_train, X_test = train_test_split(dataset, test_size=0.952)
#X_train, X_test = train_test_split(dataset, test_size=0.868)
#X_train, X_test = train_test_split(dataset, test_size=0.903)
#X_train.to_csv(os.path.join(params.save_directory, params.path_train_labeled_Data), index=False)
#X_test.to_csv(os.path.join(params.save_directory, params.path_test_Data), index=False)
#X_train.to_csv("dataset/Superconductivity_20_notnorm/train_norm.csv", index=True, index_label= "index")
#X_test.to_csv("dataset/Superconductivity_20_notnorm/test_norm.csv", index=True, index_label= "index")
#X_train.to_csv("dataset/UTKFACEBOTH/train2.csv", index=True)
#X_test.to_csv("dataset/UTKFACEBOTH/test2.csv", index=True)
return X_train, X_test
def pop_random(lst):
"""get random item from list"""
idx = random.randrange(0, len(lst))
return lst.pop(idx)
def createLabeledPairs(X_train):
"""Create Pairs between the labeled and unlabeled samples"""
pairs_label = []
lst = X_train.index.values.tolist()
#name_of_goal_variable = uncertainty
###each sample in one pair only
#while len(lst) > 0:
# rand1 = pop_random(lst)
# rand2 = pop_random(lst)
# distance = X_train.loc[rand1].critical_temp - X_train.loc[rand2].critical_temp
# pair_1 = (rand1, rand2, distance)
# pair_2 = (rand2, rand1, -distance)
# pairs_label.append(pair_1)
# pairs_label.append(pair_2)
#each sample paired with every other sample
for rand1, rand2 in itertools.combinations_with_replacement(lst, 2):
if rand1 == rand2:
continue
distance = X_train.loc[rand1].critical_temp - X_train.loc[rand2].critical_temp
pair_1 = (rand1,rand2, distance)
pair_2 = (rand2,rand1, -distance)
pairs_label.append(pair_1)
pairs_label.append(pair_2)
##take only the 5% with lowest difference as similar, 95% dissimilar
#threshold = np.quantile(np.asarray(pairs_label)[:, 2], 0.05)
#pairs_similar_index = np.where(np.asarray(pairs_label)[:, 2] <= threshold) ##similar
#pairs_dissimilar_index = np.where(np.asarray(pairs_label)[:, 2] > threshold) ##dissimilar
#
#pairs_similar = np.take(pairs_label, pairs_similar_index, axis=0)[0, :, :2]
#pairs_dissimilar = np.take(pairs_label, pairs_dissimilar_index, axis=0)[0, :, :2]
#
# ##ones for dissimilar, 0 similar based on contrastive loss definition
#pairs_dissimilar = np.append(pairs_dissimilar, np.ones((len(pairs_dissimilar), 0)), axis=0)
#pairs_similar = np.append(pairs_similar, np.zeros((len(pairs_similar), 0)), axis=0)
#
#pairs_labeled_both = np.concatenate((pairs_dissimilar, pairs_similar))
#return pairs_labeled_both, pairs_similar, pairs_dissimilar
return pairs_label
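
# Illustrative sketch of the resulting pair structure (toy data, an assumption): each entry
# is (index_a, index_b, critical_temp[index_a] - critical_temp[index_b]), and every ordered
# pair appears once in each direction with the sign of the difference flipped, e.g.
#   toy = pd.DataFrame({"feat": [0.1, 0.2, 0.3], "critical_temp": [10.0, 30.0, 70.0]})
#   createLabeledPairs(toy)
#   # -> [(0, 1, -20.0), (1, 0, 20.0), (0, 2, -60.0), (2, 0, 60.0), (1, 2, -40.0), (2, 1, 40.0)]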
def createUnlabeledPairs(X_train, X_train_u):
"""Create Pairs between the labeled and unlabeled samples"""
lst_l = X_train.index.values.tolist()
lst_u = X_train_u.index.values.tolist()
pairs_unlabel = []
while len(lst_l) > 1:
rand1 = pop_random(lst_l)
rand2 = pop_random(lst_u)
distance = np.linalg.norm(X_train.loc[rand1] - X_train_u.loc[rand2])
pair = rand1, rand2, distance
pairs_unlabel.append(pair)
lst_u = X_train_u.index.values.tolist()
#while len(lst_u) > 1:
# rand1 = pop_random(lst_u)
# rand2 = pop_random(lst_u)
# distance = np.linalg.norm(X_train_u.loc[rand1] - X_train_u.loc[rand2])
# pair = rand1, rand2, distance
# pairs_unlabel.append(pair)
for rand1, rand2 in itertools.combinations_with_replacement(lst_u, 2):
distance = np.linalg.norm(X_train_u.loc[rand1] - X_train_u.loc[rand2])
pair = rand1, rand2, distance
pairs_unlabel.append(pair)
##take only the 5% with lowest difference as similar, 95% dissimilar
threshold = np.quantile(np.asarray(pairs_unlabel)[:, 2], 0.05)
#threshold = 0
Paires_unlabeled_similar_index = np.where(np.asarray(pairs_unlabel)[:, 2] <= threshold) ##similar
Paires_unlabeled_dissimilar_index = np.where(np.asarray(pairs_unlabel)[:, 2] > threshold) ##dissimilar
pairs_unlabeled_similar = np.take(pairs_unlabel, Paires_unlabeled_similar_index, axis=0)[0, :, :2]
pairs_unlabeled_dissimilar = np.take(pairs_unlabel, Paires_unlabeled_dissimilar_index, axis=0)[0, :, :2]
pairs_unlabeled_dissimilar = np.append(pairs_unlabeled_dissimilar, np.ones((len(pairs_unlabeled_dissimilar), 1)),
axis=1)
pairs_unlabeled_similar = np.append(pairs_unlabeled_similar, np.zeros((len(pairs_unlabeled_similar), 1)), axis=1)
pairs_unlabeled_both = np.concatenate((pairs_unlabeled_dissimilar, pairs_unlabeled_similar))
##ones for dissimilar, 0 similar based on contrastive loss definition
return pairs_unlabeled_both, pairs_unlabeled_similar, pairs_unlabeled_dissimilar
def Oversampling(pairs_labeled, pairs_unlabeled):
"""Random Oversampling
duplicate random samples"""
number_U = len(pairs_unlabeled)
number_L = len(pairs_labeled)
pairs_labeled_oversampling = pairs_labeled.copy().tolist()
while number_U > len(pairs_labeled_oversampling):
idx = random.randrange(0, number_L)
pairs_labeled_oversampling.append(pairs_labeled[idx].tolist())
return np.asarray(pairs_labeled_oversampling)
def UnderSampling(pairs_similar, pairs_dissimilar):
"""Random Undersampling
remove random samples"""
number_S = len(pairs_similar)
df = pd.DataFrame(data=pairs_dissimilar, columns=["column1", "column2", "column3"])
pairs_diss_undersampled = df.sample(number_S)
return pairs_diss_undersampled.to_numpy()
def main():
##read and scale data and save for later
path_data_labeled = os.path.join(params.save_directory,params.path_features_labeled_scaled)
path_data_unlabeled = os.path.join(params.save_directory,params.path_features_unlabeled_scaled)
data_labeled = pd.read_csv(path_data_labeled,
na_values='?', comment='\t',
sep=',', header=0)
data_unlabeled = pd.read_csv(path_data_unlabeled,
na_values='?', comment='\t',
sep=',', header=0)
train_labeled, test_labeled = splitData(data_labeled)
# ##drop colum with final variable like critical temp or uncertainty for the unlabeled data
# data_unlabeled = data_unlabeled.drop(columns=[params.name_of_goal_variable])
#
# ##create pairs labeled
# pairs_labeled = createLabeledPairs(train_labeled)
#
# ##drop for labeled also to build pairs with unlabeled samples
# data_l = train_labeled.drop(columns=[params.name_of_goal_variable])
#
# pairs_unlabeled, pairs_unlabeled_similar, pairs_unlabeled_dissimilar = createUnlabeledPairs(data_l,
# data_unlabeled, )
# ###oversample labeled pairs until |labeled| == |unlabeled|
# P_L = Oversampling(pairs_labeled, pairs_unlabeled)
#
# ##split labeled pairs in dissimilar and similar pairs
# S_L_index = np.where(np.asarray(P_L)[:, 2] == 0) ## get indices for similar
# U_L_index = np.where(np.asarray(P_L)[:, 2] > 0) ## get indices fordissimilar
#
# S_L = np.take(P_L, S_L_index, axis=0)[0, :] # similar labeled pairs
# D_L = np.take(P_L, U_L_index, axis=0)[0, :] # dissimilar labeled pairs
#
# D_LU = np.concatenate((pairs_unlabeled_dissimilar, D_L))
# S_LU = np.concatenate((pairs_unlabeled_similar, S_L))
#
# ##undersampling until |similar| == |dissimilar>
# D_LU = UnderSampling(S_LU, D_LU)
#
# ##concatenate and finished
# P = np.concatenate((S_LU, D_LU))
#
# ##save final pairs
# np.savetxt(os.path.join(params.save_directory, params.path_pairs), P, delimiter=",")
#
def selectmostdiversSamples(dataframe):
    """Return the positional index of the sample farthest from a randomly chosen reference sample"""
    first_Sample = dataframe.sample(1, replace=False).index
    dataframe = dataframe.drop(columns="critical_temp")
    reference = dataframe.loc[first_Sample[0]]
    list_differences = []
    for index, row in dataframe.iterrows():
        # Euclidean distance between the current row and the reference sample.
        list_differences.append(np.linalg.norm(row - reference))
    return np.argmax(list_differences)
def selectAnchors(dataframe):
first_Sample = dataframe.sample(1, replace= False).index
dataframe = dataframe.drop(columns="critical_temp")
anchor_list = first_Sample[0]
list_differences = []
length = len(dataframe.index)
    anchor_indices = np.zeros(10, dtype=int)  # np.int is removed in recent NumPy versions
first = np.random.randint(low=0, high=length, size=1)
anchor_indices[0] = first
for i in range(1, 10):
local_max = np.zeros(length)
# Get the already selected anchors.
selected_anchors = anchor_indices[:i]
for j in range(length):
# Skip already selected anchors.
if j in selected_anchors:
continue
# Get the maximum distance from all already selected anchors.
local_max[j] = np.sum(np.linalg.norm(dataframe.iloc[selected_anchors] - np.tile(dataframe.iloc[j], (i,1)), axis = 1))
# Add the most diverse anchor to the selection.
anchor_indices[i] = np.argmax(local_max)
return anchor_indices
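
# Usage sketch (toy data, an assumption): selectAnchors greedily picks 10 rows, each new
# anchor maximising the summed Euclidean distance to the anchors already selected
# (farthest-point selection on the feature columns, with critical_temp dropped), e.g.
#   toy = pd.DataFrame(np.random.rand(50, 4), columns=["f1", "f2", "f3", "critical_temp"])
#   anchor_rows = toy.iloc[selectAnchors(toy)]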
def getBins(dataframe, num_samples):
data_sort = dataframe.sort_values("critical_temp", ascending=True, ignore_index=True)
sort_index = int(np.ceil(len(data_sort) / num_samples))
#train_labeled = data_sort[data_sort.index % sort_index == 0]
bins = (np.exp(np.log(130)*np.arange(1,num_samples+1)/num_samples)-1).tolist()
#bins = train_labeled["critical_temp"].reset_index(drop=True)
representive_samples = []
a = data_sort[data_sort.critical_temp <= bins[0]].sample(1).index
representive_samples.append(a[0])
for i in range(1,num_samples-1):
a = data_sort[(data_sort.critical_temp >= bins[i]) & (data_sort.critical_temp <= bins[i+1])].sample(1).index
representive_samples.append(a[0])
a = data_sort[(data_sort.critical_temp >= bins[num_samples-1])].sample(1).index
representive_samples.append(a[0])
train_labeled_new = data_sort.iloc[representive_samples]
rest = data_sort.drop(train_labeled_new.index)
return train_labeled_new, rest
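
# Worked example of the bin edges (an assumption about the intent): for num_samples = 5,
# np.exp(np.log(130) * np.arange(1, 6) / 5) - 1 gives roughly [1.6, 6.0, 17.6, 48.1, 129.0],
# i.e. exponentially spaced critical_temp cut-offs between ~0 and 130, so low-temperature
# bins are narrower than high-temperature ones.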
def main_2():
##read and scale data and save for later
path_data_labeled = "dataset/UTKFACEBOTH/labels_crop.csv"
path_data_labeled = "dataset/Superconductivity_small_20/superconductivity.csv"
data_labeled = pd.read_csv(path_data_labeled,
na_values='?', comment='\t',
sep=',', header=0)
#data_labeled = data_labeled[data_labeled.critical_temp < 20]
#data_labeled = data_labeled.set_index("index")
columns = data_labeled.columns
columnscale = columns[:-1]
from sklearn.preprocessing import StandardScaler
# create a scaler object
#std_scaler = StandardScaler()
# fit and transform the data
#data_labeled[columnscale] = std_scaler.fit_transform(data_labeled[columnscale])
min_max_scaler = MinMaxScaler()
data_labeled[columnscale] = min_max_scaler.fit_transform(data_labeled[columnscale])
#from sklearn.model_selection import StratifiedShuffleSplit
#ss1 = StratifiedShuffleSplit(test_size=0.999)
#ss1split = ss1.split(data_labeled, data_labeled["critical_temp"].round(-2))
#train_index, test_index = next(ss1split)
#train_labeled = data_labeled.iloc[train_index]
#train_unlabeled = data_labeled.iloc[test_index]
##load boston dataset
#from sklearn.datasets import load_boston
#boston = load_boston()
# print("Type of boston dataset:", type(boston))
# # A bunch is you remember is a dictionary based dataset. Dictionaries are addressed by keys.
# # Let's look at the keys.
# print(boston.keys())
#
# # DESCR sounds like it could be useful. Let's print the description.
# print(boston['DESCR'])
#
# # Let's change the data to a Panda's Dataframe
# import pandas as pd
# boston_df = pd.DataFrame(boston['data'])
# boston_df.head()
# #
# # # Now add the column names.
# boston_df.columns = boston['feature_names']
# boston_df.head()
# #
# # # Add the target as PRICE.
# boston_df['PRICE'] = boston['target']
# boston_df.head()
# columnscale = boston_df.columns[:-0]
#
# min_max_scaler = MinMaxScaler()
# boston_df[columnscale] = min_max_scaler.fit_transform(boston_df[columnscale])
#
# train, test_labeled = train_test_split(boston_df, test_size=0.2)
# train_unlabeled, train_labeled = train_test_split(train, test_size=0.0)
# test_labeled, evaluation = train_test_split(test_labeled, test_size=0.0)
#train, test_labeled = splitData(data_labeled)
#train_unlabeled, train_labeled = train_test_split(train, test_size=0.019)
#test_labeled, evaluation = train_test_split(test_labeled, test_size=0.00095)
for i in [5,10,20,50,100,500]:
for j in range(5):
start_path = "dataset_double_random_different_num_labeled_multiple"
directory_name = "{}".format(i)
directory_name_2 = "{}".format(j)
if not os.path.exists(os.path.join(start_path, directory_name,directory_name_2)):
os.makedirs(os.path.join(start_path, directory_name,directory_name_2))
base_path = os.path.join(start_path, directory_name,directory_name_2)
#data_sort = data_labeled.sort_values("critical_temp", ascending= True, ignore_index = True)
#sort_index = int(np.ceil(len(data_sort)/20))
#train_labeled, rest = getBins(data_labeled, i)
#train_labeled = data_labeled.sample(20)
#rest = data_labeled.drop(train_labeled.index)
#train_labeled = data_sort[data_sort.index % sort_index == 0]
#rest = data_sort[data_sort.index % sort_index != 0]
#
#evaluation = rest[rest.index % (sort_index-0) == 0]
#rest_labeled = rest[rest.index % (sort_index-0) != 0]