Commit b2730478 authored by seanlabor's avatar seanlabor

temp and nn

parent 62ad7ede
@@ -117,6 +117,7 @@ epsilon_end=0.1,num_actions=env.num_actions,
if payoffs[0]==1:
landlord_wins+=1
# Reorganize the data to be state, action, reward, next_state, done
trajectories = reorganize(trajectories, payoffs)
# Feed transitions into agent memory, and train the agent
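# A minimal sketch of the feeding step referenced above (assuming the rlcard-style
# agent.feed() interface; the actual loop sits outside this hunk): after reorganize(),
# trajectories[i] holds (state, action, reward, next_state, done) tuples for player i, e.g.
#   for ts in trajectories[0]:
#       agent[0].feed(ts)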
@@ -160,6 +161,8 @@ epsilon_end=0.1,num_actions=env.num_actions,
print("landlord_wins:",landlord_wins)
print("episodes:", episode_)
print("avreward:",avreward_)
from torchinfo import summary
print(summary(agent[0].q_estimator.qnet))
os.remove(save_temp)
#Logging
......
@@ -13,6 +13,7 @@ import torch.nn.functional as F
import torch.optim as optim
from IPython.display import clear_output
from utilitys import util_save_paths
import pickle
parser = argparse.ArgumentParser("RAYN DQN application in Doudizhu")
parser.add_argument('--num_episodes', type=int, default=100)
@@ -89,25 +90,16 @@ class Network(nn.Module):
"""Initialization."""
super(Network, self).__init__()
self.num_actions = 27472
self.state_shape = [790]
self.layers = nn.Sequential(
nn.Linear(790, 64, bias=True),
nn.Tanh(),
nn.Linear(64, 64, bias=True),
nn.Tanh(),
nn.Linear(64, out_dim, bias=True)
)
layer_dims = [np.prod(self.state_shape)]
#print("layer_dims:",layer_dims)
fc = [nn.Flatten()]
fc.append(nn.BatchNorm1d(layer_dims[0]))
for i in range(len(layer_dims)-1):
fc.append(nn.Linear(layer_dims[i], layer_dims[i+1], bias=True))
fc.append(nn.Tanh())
fc.append(nn.Linear(layer_dims[-1], self.num_actions, bias=True))
self.layers = nn.Sequential(*fc)
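# Note on the block above: layer_dims == [np.prod(self.state_shape)] contains a single
# entry, so the for-loop never runs and the resulting stack is
# Flatten -> BatchNorm1d(790) -> Linear(790, 27472), i.e. a linear Q-head without hidden layers.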
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Forward method implementation."""
@@ -282,7 +274,35 @@ class DQNAgent:
score = 0
hilf=True
temp_loaded=False
#load temp variables
if os.path.exists(save_temp):
(self.episode,
self.epsilon,
update_cnt,
self.scores,
self.losses,
self.epsilons,
self.average,
self.not_logged_yet)=pickle.load(open(save_temp,"rb"))
print('\n',"==============")
print("Temporary variables loaded:")
print("start episode:", self.episode)
print("avreward:", self.average[-1])
print("psilon_start: ", self.epsilons[-1])
print('\n',"==============")
temp_loaded=True
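# Note: the unpacking order above must match the list written by pickle.dump() in the
# logging branch further down (episode, epsilon, update_cnt, scores, losses, epsilons,
# average, not_logged_yet).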
while self.episode < num_episodes :
@@ -329,22 +349,36 @@ class DQNAgent:
if update_cnt % self.target_update == 0:
self._target_hard_update()
# plotting
# logging
if self.episode % evaluate_every == 0 and self.episode !=0 and hilf:
hilf=False
self.average.append(sum(self.scores)/len(self.scores))
print("logging episode: ", self.episode)
print("average winratio: ",self.average[-1])
#self._plot(frame_idx, scores, losses, epsilons)
tpe = (time.time()-start)/evaluate_every
start=time.time()
self.logging()
self._plot(num_episodes, agent.average, agent.losses, agent.epsilons)
print("time per episode:", tpe)
#print(losses)
#torch.save({'model_state_dict': self.dqn.state_dict(),'optimizer_state_dict': self.optimizer.state_dict()}, save_model)
print("Modell saved")
print("================")
if temp_loaded:
temp_loaded=False
else:
hilf=False
self.average.append(sum(self.scores)/len(self.scores))
print("logging episode: ", self.episode)
print("average winratio: ",self.average[-1])
#self._plot(frame_idx, scores, losses, epsilons)
tpe = (time.time()-start)/evaluate_every
start=time.time()
self.logging()
self._plot(num_episodes, agent.average, agent.losses, agent.epsilons)
print("time per episode:", tpe)
torch.save({'model_state_dict': self.dqn.state_dict(),'optimizer_state_dict': self.optimizer.state_dict()}, save_model)
print("Modell saved")
#save temp variables
pickle.dump([self.episode,
self.epsilon,
update_cnt,
self.scores,
self.losses,
self.epsilons,
self.average,
self.not_logged_yet],
open(save_temp, "wb"))
print("temp variables saved")
print("================")
def logging(self):
from torchinfo import summary
@@ -352,7 +386,9 @@ class DQNAgent:
file1 = open(save_txt,"a", encoding="utf-8")
if self.not_logged_yet:
print("logging first time")
file1.write("-----New Run-------")
file1.write('\n')
file1.write("=======New Run========")
file1.write('\n')
file1.write('\n')
file1.write("Parameters:")
lines=["memory_size: {}".format(memory_size),"batch_size: {}".format(batch_size),"target_update: {}".format(target_update),"epsilon_decay: {}".format(epsilon_decay),"min_epsilon: {}".format(self.min_epsilon),"max_epsilon: {}".format(self.max_epsilon),"gamma: {}".format(self.gamma), "neural net structure: {}".format(summary(self.dqn))]
@@ -504,6 +540,7 @@ agent = DQNAgent(env, memory_size, batch_size, target_update, epsilon_decay,max_
agent.train(num_episodes)
agent._plot(num_episodes, agent.average, agent.losses, agent.epsilons)
os.remove(save_temp)
#frames = agent.test()
......
import sys
import os
import time
import argparse
from typing import Dict, List, Tuple
from datetime import datetime
import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import clear_output
from utilitys import util_save_paths
parser = argparse.ArgumentParser("RAYN DQN application in Doudizhu")
parser.add_argument('--num_episodes', type=int, default=100)
parser.add_argument('--evaluate_every', type=int, default=10)
args = parser.parse_args()
#Specify path variables
googledrive_path="/content/drive/MyDrive/Google_Colab/rlcards/experiments"
local_path="C:/Users/Flo/Documents/Uni/Masterarbeit/venv/latest/experiments/"
num_episodes = args.num_episodes
evaluate_every=args.evaluate_every
memory_size = 20000
batch_size = 32
target_update = 100
epsilon_decay = 1 / 20000
learning_rate=0.00005
max_epsilon= 1.0
min_epsilon= 0.1
gamma=0.99
'''returns full save path variables
save_model: torch model path
save_path: for log file
save_temp: save temp files (not implemented here)
'''
save_path,save_model,save_temp=util_save_paths(googledrive_path, local_path, __file__)
class ReplayBuffer:
"""A simple numpy replay buffer."""
def __init__(self, obs_dim: int, size: int, batch_size: int = 32):
self.obs_buf = np.zeros([size, obs_dim], dtype=np.float32)
self.next_obs_buf = np.zeros([size, obs_dim], dtype=np.float32)
self.acts_buf = np.zeros([size], dtype=np.float32)
self.rews_buf = np.zeros([size], dtype=np.float32)
self.done_buf = np.zeros(size, dtype=np.float32)
self.max_size, self.batch_size = size, batch_size
self.ptr, self.size = 0, 0
def store(
self,
obs: np.ndarray,
act: np.ndarray,
rew: float,
next_obs: np.ndarray,
done: bool,
):
self.obs_buf[self.ptr] = obs
self.next_obs_buf[self.ptr] = next_obs
self.acts_buf[self.ptr] = act
self.rews_buf[self.ptr] = rew
self.done_buf[self.ptr] = done
self.ptr = (self.ptr + 1) % self.max_size
self.size = min(self.size + 1, self.max_size)
def sample_batch(self) -> Dict[str, np.ndarray]:
idxs = np.random.choice(self.size, size=self.batch_size, replace=False)
return dict(obs=self.obs_buf[idxs],
next_obs=self.next_obs_buf[idxs],
acts=self.acts_buf[idxs],
rews=self.rews_buf[idxs],
done=self.done_buf[idxs])
def __len__(self) -> int:
return self.size
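# Usage sketch for the buffer above (hypothetical values, not part of the training flow):
#   buf = ReplayBuffer(obs_dim=790, size=20000, batch_size=32)
#   buf.store(np.zeros(790, dtype=np.float32), 0, 1.0, np.zeros(790, dtype=np.float32), False)
#   if len(buf) >= buf.batch_size:
#       batch = buf.sample_batch()  # dict with keys: obs, next_obs, acts, rews, done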
class Network(nn.Module):
def __init__(self, in_dim: int, out_dim: int):
"""Initialization."""
super(Network, self).__init__()
self.layers = nn.Sequential(
nn.Linear(790, 64, bias=True),
nn.Tanh(),
nn.Linear(64, 64, bias=True),
nn.Tanh(),
nn.Linear(64, out_dim, bias=True)
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Forward method implementation."""
return self.layers(x)
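# Shape sketch for the network above (assuming the 790-dim state encoding and the
# 27472 Doudizhu action ids used elsewhere in this script; note in_dim is ignored
# because the first layer hard-codes 790):
#   net = Network(in_dim=790, out_dim=27472)
#   q_values = net(torch.zeros(1, 790))  # -> tensor of shape (1, 27472)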
# class Network(nn.Module):
# def __init__(self, in_dim: int, out_dim: int):
# """Initialization."""
# super(Network, self).__init__()
#
#
#
#
#
#
#
# self.num_actions = 27472
# self.state_shape = [790]
# self.mlp_layers = [64, 64]
# print(self.num_actions,self.state_shape,self.mlp_layers )
#
# layer_dims = [np.prod(self.state_shape)] + self.mlp_layers
# print("layer_dims:",layer_dims)
# fc = [nn.Flatten()]
# fc.append(nn.BatchNorm1d(layer_dims[0]))
# for i in range(len(layer_dims)-1):
# fc.append(nn.Linear(layer_dims[i], layer_dims[i+1], bias=True))
# fc.append(nn.Tanh())
# fc.append(nn.Linear(layer_dims[-1], self.num_actions, bias=True))
# self.layers = nn.Sequential(*fc)
#
# def forward(self, x: torch.Tensor) -> torch.Tensor:
# """Forward method implementation."""
# return self.layers(x)
class DQNAgent:
"""DQN Agent interacting with environment.
Attributes:
env (gym.Env): openAI Gym environment
memory (ReplayBuffer): replay memory to store transitions
batch_size (int): batch size for sampling
epsilon (float): parameter for epsilon greedy policy
epsilon_decay (float): step size to decrease epsilon
max_epsilon (float): max value of epsilon
min_epsilon (float): min value of epsilon
target_update (int): period for target model's hard update
gamma (float): discount factor
dqn (Network): model to train and select actions
dqn_target (Network): target model to update
optimizer (torch.optim): optimizer for training dqn
transition (list): transition information including
state, action, reward, next_state, done
"""
def __init__(
self,
env,
memory_size: int,
batch_size: int,
target_update: int,
epsilon_decay: float,
max_epsilon: float,
min_epsilon: float,
gamma: float,
):
"""Initialization.
Args:
env (gym.Env): openAI Gym environment
memory_size (int): length of memory
batch_size (int): batch size for sampling
target_update (int): period for target model's hard update
epsilon_decay (float): step size to decrease epsilon
lr (float): learning rate
max_epsilon (float): max value of epsilon
min_epsilon (float): min value of epsilon
gamma (float): discount factor
"""
#obs_dim = env.observation_space.shape[0]
obs_dim = 790
#action_dim = env.action_space.n
action_dim=27472
self.env = env
self.memory = ReplayBuffer(obs_dim, memory_size, batch_size)
self.batch_size = batch_size
self.epsilon = max_epsilon
self.epsilon_decay = epsilon_decay
self.max_epsilon = max_epsilon
self.min_epsilon = min_epsilon
self.target_update = target_update
self.gamma = gamma
self.scores=[]
self.losses = []
self.epsilons = []
self.average =[]
self.epi_loss_tracker=[]
self.episode=0
self.not_logged_yet=True
# device: cpu / gpu
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu"
)
print(self.device)
# networks: dqn, dqn_target
self.dqn = Network(obs_dim, action_dim).to(self.device)
self.dqn_target = Network(obs_dim, action_dim).to(self.device)
self.dqn_target.load_state_dict(self.dqn.state_dict())
self.dqn_target.eval()
from torchinfo import summary
print(summary(self.dqn))
# optimizer
self.optimizer = optim.Adam(self.dqn.parameters(), lr=learning_rate)
# if os.path.exists(save_model):
# if not torch.cuda.is_available():
# checkpoint = torch.load(save_model, map_location=torch.device('cpu'))
# else:
# checkpoint = torch.load(save_model)
# self.dqn.load_state_dict(checkpoint['model_state_dict'])
# self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# print("NN parameters loaded from file")
# transition to store in memory
self.transition = list()
# mode: train / test
self.is_test = False
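# Construction sketch (mirrors the call near the bottom of this script; all names are
# the module-level hyperparameters defined above):
#   agent = DQNAgent(env, memory_size, batch_size, target_update, epsilon_decay,
#                    max_epsilon, min_epsilon, gamma)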
def select_action(self, state: np.ndarray) -> np.ndarray:
"""Select an action from the input state."""
# epsilon greedy policy
#print(state)
if self.epsilon > np.random.random():
selected_action = self.env.action_space_sample()
#print("random action taken:", selected_action)
else:
legal_actions = list(env._get_legal_actions().keys())
#print(legal_actions)
selected_action = self.dqn(
torch.FloatTensor(state).to(self.device)
)
selected_action = selected_action.detach().cpu().numpy()
liste=list(selected_action)
result = [liste[i] for i in legal_actions]
selected_action = liste.index(max(result))
#print("selected_action taken:", selected_action)
if not self.is_test:
self.transition = [state, selected_action]
return selected_action
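# Note on the selection above: liste.index(max(result)) returns the first position in
# the full Q-vector whose value equals the best legal Q-value, so on an exact tie with
# an earlier illegal action it can resolve to an illegal action id.  A tie-safe variant
# using only the names already defined in this method would be:
#   selected_action = legal_actions[int(np.argmax(result))]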
def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, bool]:
"""Take an action and return the response of the env.
The Doudizhu2 rlcard env sometimes returns an empty state when the other players finish the round, so the nonestate flag (bool) needs to be checked; feeding a None state into the network causes breakage (NaN).
"""
next_state, reward, done, nonestate = self.env.step(action)
if not self.is_test and not nonestate:
self.transition += [reward, next_state, done]
#print(self.transition)
self.memory.store(*self.transition)
return next_state, reward, done, nonestate
def update_model(self) -> torch.Tensor:
"""Update the model by gradient descent."""
samples = self.memory.sample_batch()
#print(samples)
loss = self._compute_dqn_loss(samples)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
return loss.item()
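# Hedged note: _compute_dqn_loss() is defined elsewhere in this file; for a vanilla DQN
# it typically regresses Q(s, a) toward the bootstrap target
#   r + gamma * max_a' Q_target(s', a') * (1 - done)
# using the samples drawn from the replay buffer above.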
def train(self, num_frames: int, plotting_interval: int = 200):
"""Train the agent."""
start=time.time()
self.is_test = False
state = self.env.reset()
update_cnt = 0
score = 0
hilf=True
while self.episode < num_episodes :
action = self.select_action(state)
#print(action)
next_state, reward, done, nonestate = self.step(action)
state = next_state
score += reward
# if episode ends
if done:
state = self.env.reset()
#print(state)
self.episode+=1
self.scores.append(score)
score = 0
hilf=True
elif nonestate:
state = self.env.reset()
#print(state)
self.episode+=1
self.scores.append(score)
score = 0
hilf=True
# if training is ready
if len(self.memory) >= self.batch_size:
loss = self.update_model()
self.losses.append(loss)
update_cnt += 1
# linearly decrease epsilon
self.epsilon = max(
self.min_epsilon, self.epsilon - (
self.max_epsilon - self.min_epsilon
) * self.epsilon_decay
)
self.epsilons.append(self.epsilon)
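# Worked example for the linear decay above, using the values set in this script
# (epsilon_decay = 1/20000, max_epsilon = 1.0, min_epsilon = 0.1): epsilon shrinks by
# (1.0 - 0.1) / 20000 = 4.5e-5 per update step and bottoms out at 0.1 after 20000 updates.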
# if hard update is needed
if update_cnt % self.target_update == 0:
self._target_hard_update()
# plotting
if self.episode % evaluate_every == 0 and self.episode !=0 and hilf:
hilf=False
self.average.append(sum(self.scores)/len(self.scores))
print("logging episode: ", self.episode)
print("average winratio: ",self.average[-1])
#self._plot(frame_idx, scores, losses, epsilons)
tpe = (time.time()-start)/evaluate_every
start=time.time()
self.logging()
self._plot(num_episodes, agent.average, agent.losses, agent.epsilons)
print("time per episode:", tpe)
#print(losses)
#torch.save({'model_state_dict': self.dqn.state_dict(),'optimizer_state_dict': self.optimizer.state_dict()}, save_model)
print("Modell saved")
print("================")
def logging(self):
from torchinfo import summary
save_txt = os.path.join(save_path, 'log_DQN.txt')
file1 = open(save_txt,"a", encoding="utf-8")
if self.not_logged_yet:
print("logging first time")
file1.write("-----New Run-------")
file1.write('\n')
file1.write("Parameters:")
lines=["memory_size: {}".format(memory_size),"batch_size: {}".format(batch_size),"target_update: {}".format(target_update),"epsilon_decay: {}".format(epsilon_decay),"min_epsilon: {}".format(self.min_epsilon),"max_epsilon: {}".format(self.max_epsilon),"gamma: {}".format(self.gamma), "neural net structure: {}".format(summary(self.dqn))]
#lines=[memory_size,batch_size,target_update,epsilon_decay,self.min_epsilon,self.max_epsilon,self.gamma,datetime.now(),]
for line in lines:
file1.write('\n')
file1.write(str(line))
file1.write('\n')
file1.write("logged variables: self.episode, self.average[-1],datetime.now():")
lines=["Episode: {}".format(self.episode), "Average: {}".format(self.average[-1]),"Time: {}".format(datetime.now()),"-------"]
for line in lines:
file1.write('\n')
file1.write(str(line))
file1.close()
##Losses
save_logs = os.path.join(save_path, 'log_loss.txt')
file2 = open(save_logs,"a")
if self.not_logged_yet:
file2.write("episode/num_episodes, self.losses, datetime.now()")
self.not_logged_yet=False
file2.write('\n')
file2.write("{}/{}".format(self.episode, num_episodes))
lines=[self.losses, datetime.now(),"-------"]
for line in lines:
file2.write('\n')
file2.write(str(line))
file2.close()
def test(self) -> List[np.ndarray]:
"""Test the agent."""
self.is_test = True