seanlabor / rlcards · Commits

Commit b2730478
Authored Nov 29, 2021 by seanlabor
Commit message: temp and nn
Parent: 62ad7ede
Changes: 11
examples/run_rl_RAYN_DQN_PER.py → examples/archiv/run_rl_RAYN_DQN_PER.py (file moved)
examples/run_rl_RAYN_DQN_cat.py → examples/archiv/run_rl_RAYN_DQN_cat.py (file moved)
examples/run_rl_RAYN_DQN_dueling.py → examples/archiv/run_rl_RAYN_DQN_dueling.py (file moved)
examples/run_rl_RAYN_DQN_noisynet.py → examples/archiv/run_rl_RAYN_DQN_noisynet.py (file moved)
examples/run_rl_RAYN_DQN_nstep.py → examples/archiv/run_rl_RAYN_DQN_nstep.py (file moved)
examples/run_rl_RAYN_DoubleDQN.py → examples/archiv/run_rl_RAYN_DoubleDQN.py (file moved)
examples/run_rl_DQN.py (view file @ b2730478)
...
@@ -117,6 +117,7 @@ epsilon_end=0.1,num_actions=env.num_actions,
    if payoffs[0] == 1:
        landlord_wins += 1

    # Reorganize the data to be state, action, reward, next_state, done
    trajectories = reorganize(trajectories, payoffs)

    # Feed transitions into agent memory, and train the agent
...
@@ -160,6 +161,8 @@ epsilon_end=0.1,num_actions=env.num_actions,
    print("landlord_wins:", landlord_wins)
    print("episodes:", episode_)
    print("avreward:", avreward_)

    from torchinfo import summary
    print(summary(agent[0].q_estimator.qnet))

    os.remove(save_temp)

    #Logging
...
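The two lines added here print a torchinfo summary of the trained Q-network. As a quick reference, this is how torchinfo.summary behaves on a small stand-in module; toy_net below is illustrative only and is not the project's q_estimator.qnet:

import torch.nn as nn
from torchinfo import summary

# toy stand-in; the layer sizes mirror the 790 -> 64 -> 64 MLPs used in this repo
toy_net = nn.Sequential(nn.Linear(790, 64), nn.Tanh(), nn.Linear(64, 64))
print(summary(toy_net))  # prints the layer hierarchy and parameter counts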
examples/run_rl_RAYN_DQN.py (view file @ b2730478)
...
@@ -13,6 +13,7 @@ import torch.nn.functional as F
import torch.optim as optim
from IPython.display import clear_output
from utilitys import util_save_paths
import pickle

parser = argparse.ArgumentParser("RAYN DQN application in Doudizhu")
parser.add_argument('--num_episodes', type=int, default=100)
...
@@ -89,25 +90,16 @@ class Network(nn.Module):
        """Initialization."""
        super(Network, self).__init__()

        self.num_actions = 27472
        self.state_shape = [790]

        self.layers = nn.Sequential(
            nn.Linear(790, 64, bias=True),
            nn.Tanh(),
            nn.Linear(64, 64, bias=True),
            nn.Tanh(),
            nn.Linear(64, out_dim, bias=True)
        )

        layer_dims = [np.prod(self.state_shape)]
        #print("layer_dims:",layer_dims)
        fc = [nn.Flatten()]
        fc.append(nn.BatchNorm1d(layer_dims[0]))
        for i in range(len(layer_dims) - 1):
            fc.append(nn.Linear(layer_dims[i], layer_dims[i+1], bias=True))
            fc.append(nn.Tanh())
        fc.append(nn.Linear(layer_dims[-1], self.num_actions, bias=True))
        self.layers = nn.Sequential(*fc)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward method implementation."""
...
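Because layer_dims in this hunk is just [np.prod(self.state_shape)] with no hidden sizes appended, the builder loop never executes, so nn.Sequential(*fc) resolves to Flatten → BatchNorm1d(790) → Linear(790, 27472). A self-contained re-run of that construction, with values copied from the hunk (the net/print names are only for illustration):

import numpy as np
import torch.nn as nn

state_shape, num_actions = [790], 27472
layer_dims = [np.prod(state_shape)]            # == [790]
fc = [nn.Flatten(), nn.BatchNorm1d(layer_dims[0])]
for i in range(len(layer_dims) - 1):           # range(0): body never runs
    fc.append(nn.Linear(layer_dims[i], layer_dims[i + 1], bias=True))
    fc.append(nn.Tanh())
fc.append(nn.Linear(layer_dims[-1], num_actions, bias=True))
net = nn.Sequential(*fc)
print(net)  # Flatten, BatchNorm1d(790), Linear(790 -> 27472)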
@@ -282,7 +274,35 @@ class DQNAgent:
        score = 0
        hilf = True
        temp_loaded = False

        # load temp variables
        if os.path.exists(save_temp):
            (self.episode, self.epsilon, update_cnt, self.scores, self.losses,
             self.epsilons, self.average, self.not_logged_yet) = pickle.load(open(save_temp, "rb"))
            print('\n', "==============")
            print("Temporary variables loaded:")
            print("start episode:", self.episode)
            print("avreward:", self.average[-1])
            print("epsilon_start: ", self.epsilons[-1])
            print('\n', "==============")
            temp_loaded = True

        while self.episode < num_episodes:
...
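This hunk is the "temp" part of the commit: on start-up the agent reloads its episode counter, epsilon schedule, and logging lists from a pickle file left behind by an interrupted run, and the final hunk of this file removes that file once training completes. Stripped of the agent, the resume pattern looks roughly like this (the file name and the toy state values are illustrative, not from the commit):

import os
import pickle

save_temp = "train_temp.pkl"                      # illustrative path
episode, epsilon, update_cnt = 0, 1.0, 0          # toy training state

# periodically persist the state...
with open(save_temp, "wb") as f:
    pickle.dump([episode, epsilon, update_cnt], f)

# ...and resume from it on the next start, if a previous run left it behind
if os.path.exists(save_temp):
    with open(save_temp, "rb") as f:
        episode, epsilon, update_cnt = pickle.load(f)

os.remove(save_temp)                              # clean up after a completed run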
@@ -329,22 +349,36 @@ class DQNAgent:
                if update_cnt % self.target_update == 0:
                    self._target_hard_update()

            # logging
            if self.episode % evaluate_every == 0 and self.episode != 0 and hilf:
                hilf = False
                self.average.append(sum(self.scores) / len(self.scores))
                print("logging episode: ", self.episode)
                print("average winratio: ", self.average[-1])
                #self._plot(frame_idx, scores, losses, epsilons)
                tpe = (time.time() - start) / evaluate_every
                start = time.time()
                self.logging()
                self._plot(num_episodes, agent.average, agent.losses, agent.epsilons)
                print("time per episode:", tpe)
                #print(losses)
                #torch.save({'model_state_dict': self.dqn.state_dict(), 'optimizer_state_dict': self.optimizer.state_dict()}, save_model)
                print("Model saved")
                print("================")

                if temp_loaded:
                    temp_loaded = False
                else:
                    hilf = False
                    self.average.append(sum(self.scores) / len(self.scores))
                    print("logging episode: ", self.episode)
                    print("average winratio: ", self.average[-1])
                    #self._plot(frame_idx, scores, losses, epsilons)
                    tpe = (time.time() - start) / evaluate_every
                    start = time.time()
                    self.logging()
                    self._plot(num_episodes, agent.average, agent.losses, agent.epsilons)
                    print("time per episode:", tpe)
                    torch.save({'model_state_dict': self.dqn.state_dict(), 'optimizer_state_dict': self.optimizer.state_dict()}, save_model)
                    print("Model saved")
                    #save temp variables
                    pickle.dump([self.episode, self.epsilon, update_cnt, self.scores, self.losses,
                                 self.epsilons, self.average, self.not_logged_yet], open(save_temp, "wb"))
                    print("temp variables saved")
                    print("================")

    def logging(self):
        from torchinfo import summary
...
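Besides the pickled temp variables, the new branch above also writes a regular PyTorch checkpoint containing the model and optimizer state dicts. In isolation, that round trip looks like this (the toy_ names and the path are placeholders, not from the commit):

import torch
import torch.nn as nn
import torch.optim as optim

toy_model = nn.Linear(790, 64)
toy_opt = optim.Adam(toy_model.parameters(), lr=0.00005)
save_model = "checkpoint.pt"                     # illustrative path

# save both state dicts in one file
torch.save({'model_state_dict': toy_model.state_dict(),
            'optimizer_state_dict': toy_opt.state_dict()}, save_model)

# restore them later (map_location lets a GPU checkpoint load on CPU)
checkpoint = torch.load(save_model, map_location=torch.device('cpu'))
toy_model.load_state_dict(checkpoint['model_state_dict'])
toy_opt.load_state_dict(checkpoint['optimizer_state_dict'])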
@@ -352,7 +386,9 @@ class DQNAgent:
        file1 = open(save_txt, "a", encoding="utf-8")
        if self.not_logged_yet:
            print("logging first time")
            file1.write("-----New Run-------")
            file1.write('\n')
            file1.write("=======New Run========")
            file1.write('\n')
            file1.write('\n')
            file1.write("Parameters:")
            lines = ["memory_size: {}".format(memory_size),
                     "batch_size: {}".format(batch_size),
                     "target_update: {}".format(target_update),
                     "epsilon_decay: {}".format(epsilon_decay),
                     "min_epsilon: {}".format(self.min_epsilon),
                     "max_epsilon: {}".format(self.max_epsilon),
                     "gamma: {}".format(self.gamma),
                     "neural net structure: {}".format(summary(self.dqn))]
...
@@ -504,6 +540,7 @@ agent = DQNAgent(env, memory_size, batch_size, target_update, epsilon_decay,max_
agent.train(num_episodes)
agent._plot(num_episodes, agent.average, agent.losses, agent.epsilons)
os.remove(save_temp)
#frames = agent.test()
...
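The new DoubleDQN script below anneals epsilon linearly inside the training loop via epsilon = max(min_epsilon, epsilon - (max_epsilon - min_epsilon) * epsilon_decay). With the constants that script sets (max_epsilon=1.0, min_epsilon=0.1, epsilon_decay=1/20000), the floor is reached after roughly 20,000 update steps; a quick standalone check of that schedule:

max_epsilon, min_epsilon, epsilon_decay = 1.0, 0.1, 1 / 20000

epsilon, steps = max_epsilon, 0
while epsilon > min_epsilon:
    # same update rule as in DQNAgent.train()
    epsilon = max(min_epsilon, epsilon - (max_epsilon - min_epsilon) * epsilon_decay)
    steps += 1
print(steps)  # about 20,000 updates until epsilon hits its 0.1 floor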
examples/run_rl_RAYN_DQN_DoubleDQN.py (new file, 0 → 100644, view file @ b2730478)
import sys
import os
import time
import argparse
from typing import Dict, List, Tuple
from datetime import datetime

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import clear_output

from utilitys import util_save_paths

parser = argparse.ArgumentParser("RAYN DQN application in Doudizhu")
parser.add_argument('--num_episodes', type=int, default=100)
parser.add_argument('--evaluate_every', type=int, default=10)
args = parser.parse_args()

#Specify path variables
googledrive_path = "/content/drive/MyDrive/Google_Colab/rlcards/experiments"
local_path = "C:/Users/Flo/Documents/Uni/Masterarbeit/venv/latest/experiments/"

num_episodes = args.num_episodes
evaluate_every = args.evaluate_every
memory_size = 20000
batch_size = 32
target_update = 100
epsilon_decay = 1/20000
learning_rate = 0.00005
max_epsilon = 1.0
min_epsilon = 0.1
gamma = 0.99

'''returns full save path variables
save_model: torch model path
save_path: for log file
save_temp: save temp files (not implemented here)
'''
save_path, save_model, save_temp = util_save_paths(googledrive_path, local_path, __file__)


class ReplayBuffer:
    """A simple numpy replay buffer."""

    def __init__(self, obs_dim: int, size: int, batch_size: int = 32):
        self.obs_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.next_obs_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.acts_buf = np.zeros([size], dtype=np.float32)
        self.rews_buf = np.zeros([size], dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.max_size, self.batch_size = size, batch_size
        self.ptr, self.size = 0, 0

    def store(
        self,
        obs: np.ndarray,
        act: np.ndarray,
        rew: float,
        next_obs: np.ndarray,
        done: bool,
    ):
        self.obs_buf[self.ptr] = obs
        self.next_obs_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self) -> Dict[str, np.ndarray]:
        idxs = np.random.choice(self.size, size=self.batch_size, replace=False)
        return dict(obs=self.obs_buf[idxs],
                    next_obs=self.next_obs_buf[idxs],
                    acts=self.acts_buf[idxs],
                    rews=self.rews_buf[idxs],
                    done=self.done_buf[idxs])

    def __len__(self) -> int:
        return self.size


class Network(nn.Module):
    def __init__(self, in_dim: int, out_dim: int):
        """Initialization."""
        super(Network, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(790, 64, bias=True),
            nn.Tanh(),
            nn.Linear(64, 64, bias=True),
            nn.Tanh(),
            nn.Linear(64, out_dim, bias=True)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward method implementation."""
        return self.layers(x)

# class Network(nn.Module):
# def __init__(self, in_dim: int, out_dim: int):
# """Initialization."""
# super(Network, self).__init__()
#
#
#
#
#
#
#
# self.num_actions = 27472
# self.state_shape = [790]
# self.mlp_layers = [64, 64]
# print(self.num_actions,self.state_shape,self.mlp_layers )
#
# layer_dims = [np.prod(self.state_shape)] + self.mlp_layers
# print("layer_dims:",layer_dims)
# fc = [nn.Flatten()]
# fc.append(nn.BatchNorm1d(layer_dims[0]))
# for i in range(len(layer_dims)-1):
# fc.append(nn.Linear(layer_dims[i], layer_dims[i+1], bias=True))
# fc.append(nn.Tanh())
# fc.append(nn.Linear(layer_dims[-1], self.num_actions, bias=True))
# self.layers = nn.Sequential(*fc)
#
# def forward(self, x: torch.Tensor) -> torch.Tensor:
# """Forward method implementation."""
# return self.layers(x)
class DQNAgent:
    """DQN Agent interacting with environment.

    Attribute:
        env (gym.Env): openAI Gym environment
        memory (ReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        epsilon (float): parameter for epsilon greedy policy
        epsilon_decay (float): step size to decrease epsilon
        max_epsilon (float): max value of epsilon
        min_epsilon (float): min value of epsilon
        target_update (int): period for target model's hard update
        gamma (float): discount factor
        dqn (Network): model to train and select actions
        dqn_target (Network): target model to update
        optimizer (torch.optim): optimizer for training dqn
        transition (list): transition information including
            state, action, reward, next_state, done
    """
    def __init__(
        self,
        env,
        memory_size: int,
        batch_size: int,
        target_update: int,
        epsilon_decay: float,
        max_epsilon: float,
        min_epsilon: float,
        gamma: float,
    ):
        """Initialization.

        Args:
            env (gym.Env): openAI Gym environment
            memory_size (int): length of memory
            batch_size (int): batch size for sampling
            target_update (int): period for target model's hard update
            epsilon_decay (float): step size to decrease epsilon
            lr (float): learning rate
            max_epsilon (float): max value of epsilon
            min_epsilon (float): min value of epsilon
            gamma (float): discount factor
        """
        #obs_dim = env.observation_space.shape[0]
        obs_dim = 790
        #action_dim = env.action_space.n
        action_dim = 27472

        self.env = env
        self.memory = ReplayBuffer(obs_dim, memory_size, batch_size)
        self.batch_size = batch_size
        self.epsilon = max_epsilon
        self.epsilon_decay = epsilon_decay
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.target_update = target_update
        self.gamma = gamma

        self.scores = []
        self.losses = []
        self.epsilons = []
        self.average = []
        self.epi_loss_tracker = []
        self.episode = 0
        self.not_logged_yet = True

        # device: cpu / gpu
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)

        # networks: dqn, dqn_target
        self.dqn = Network(obs_dim, action_dim).to(self.device)
        self.dqn_target = Network(obs_dim, action_dim).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        from torchinfo import summary
        print(summary(self.dqn))

        # optimizer
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=learning_rate)

        # if os.path.exists(save_model):
        #     if not torch.cuda.is_available():
        #         checkpoint = torch.load(save_model, map_location=torch.device('cpu'))
        #     else:
        #         checkpoint = torch.load(save_model)
        #     self.dqn.load_state_dict(checkpoint['model_state_dict'])
        #     self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        #     print("NN parameters loaded from file")

        # transition to store in memory
        self.transition = list()

        # mode: train / test
        self.is_test = False
    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        # epsilon greedy policy
        #print(state)
        if self.epsilon > np.random.random():
            selected_action = self.env.action_space_sample()
            #print("random action taken:", selected_action)
        else:
            legal_actions = list(env._get_legal_actions().keys())
            #print(legal_actions)
            selected_action = self.dqn(
                torch.FloatTensor(state).to(self.device)
            )
            selected_action = selected_action.detach().cpu().numpy()
            liste = list(selected_action)
            result = [liste[i] for i in legal_actions]
            selected_action = liste.index(max(result))
            #print("selected_action taken:", selected_action)

        if not self.is_test:
            self.transition = [state, selected_action]

        return selected_action
    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool]:
        """Take an action and return the response of the env.

        The Doudizhu2 rlcard env sometimes returns empty states when other players
        finish the round, so nonestate (bool) has to be checked: feeding a None
        state into the NN breaks training (NaN).
        """
        next_state, reward, done, nonestate = self.env.step(action)

        if not self.is_test and not nonestate:
            self.transition += [reward, next_state, done]
            #print(self.transition)
            self.memory.store(*self.transition)

        return next_state, reward, done, nonestate
    def update_model(self) -> torch.Tensor:
        """Update the model by gradient descent."""
        samples = self.memory.sample_batch()
        #print(samples)
        loss = self._compute_dqn_loss(samples)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()
    def train(self, num_frames: int, plotting_interval: int = 200):
        start = time.time()
        """Train the agent."""
        self.is_test = False

        state = self.env.reset()
        update_cnt = 0
        score = 0
        hilf = True

        while self.episode < num_episodes:
            action = self.select_action(state)
            #print(action)
            next_state, reward, done, nonestate = self.step(action)

            state = next_state
            score += reward

            # if episode ends
            if done:
                state = self.env.reset()
                #print(state)
                self.episode += 1
                self.scores.append(score)
                score = 0
                hilf = True
            elif nonestate:
                state = self.env.reset()
                #print(state)
                self.episode += 1
                self.scores.append(score)
                score = 0
                hilf = True

            # if training is ready
            if len(self.memory) >= self.batch_size:
                loss = self.update_model()
                self.losses.append(loss)
                update_cnt += 1

                # linearly decrease epsilon
                self.epsilon = max(
                    self.min_epsilon,
                    self.epsilon - (self.max_epsilon - self.min_epsilon) * self.epsilon_decay
                )
                self.epsilons.append(self.epsilon)

                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update()

            # plotting
            if self.episode % evaluate_every == 0 and self.episode != 0 and hilf:
                hilf = False
                self.average.append(sum(self.scores) / len(self.scores))
                print("logging episode: ", self.episode)
                print("average winratio: ", self.average[-1])
                #self._plot(frame_idx, scores, losses, epsilons)
                tpe = (time.time() - start) / evaluate_every
                start = time.time()
                self.logging()
                self._plot(num_episodes, agent.average, agent.losses, agent.epsilons)
                print("time per episode:", tpe)
                #print(losses)
                #torch.save({'model_state_dict': self.dqn.state_dict(), 'optimizer_state_dict': self.optimizer.state_dict()}, save_model)
                print("Model saved")
                print("================")
    def logging(self):
        from torchinfo import summary
        save_txt = os.path.join(save_path, 'log_DQN.txt')
        file1 = open(save_txt, "a", encoding="utf-8")
        if self.not_logged_yet:
            print("logging first time")
            file1.write("-----New Run-------")
            file1.write('\n')
            file1.write("Parameters:")
            lines = ["memory_size: {}".format(memory_size),
                     "batch_size: {}".format(batch_size),
                     "target_update: {}".format(target_update),
                     "epsilon_decay: {}".format(epsilon_decay),
                     "min_epsilon: {}".format(self.min_epsilon),
                     "max_epsilon: {}".format(self.max_epsilon),
                     "gamma: {}".format(self.gamma),
                     "neural net structure: {}".format(summary(self.dqn))]
            #lines=[memory_size,batch_size,target_update,epsilon_decay,self.min_epsilon,self.max_epsilon,self.gamma,datetime.now(),]
            for line in lines:
                file1.write('\n')
                file1.write(str(line))
            file1.write('\n')
            file1.write("logged variables: self.episode, self.average[-1],datetime.now():")

        lines = ["Episode: {}".format(self.episode),
                 "Average: {}".format(self.average[-1]),
                 "Time: {}".format(datetime.now()),
                 "-------"]
        for line in lines:
            file1.write('\n')
            file1.write(str(line))
        file1.close()

        ##Losses
        save_logs = os.path.join(save_path, 'log_loss.txt')
        file2 = open(save_logs, "a")
        if self.not_logged_yet:
            file2.write("episode/num_episodes, self.losses, datetime.now()")
            self.not_logged_yet = False
        file2.write('\n')
        file2.write("{}/{}".format(self.episode, num_episodes))
        lines = [self.losses, datetime.now(), "-------"]
        for line in lines:
            file2.write('\n')
            file2.write(str(line))
        file2.close()
    def test(self) -> List[np.ndarray]:
        """Test the agent."""
        self.is_test = True
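The excerpt of this new file ends before _compute_dqn_loss, so the Double-DQN-specific part is not visible here. For orientation only, the textbook Double DQN target uses the online network to choose the next action and the target network to evaluate it; the sketch below uses toy tensors and is an assumption about the variant, not code from this file:

import torch
import torch.nn as nn

# toy stand-ins for self.dqn / self.dqn_target and one sampled batch
obs_dim, n_actions, batch = 8, 5, 4
dqn, dqn_target = nn.Linear(obs_dim, n_actions), nn.Linear(obs_dim, n_actions)
next_state = torch.randn(batch, obs_dim)
reward = torch.zeros(batch, 1)
done = torch.zeros(batch, 1)
gamma = 0.99

# online net chooses the next action, target net evaluates that choice
next_action = dqn(next_state).argmax(dim=1, keepdim=True)
next_q = dqn_target(next_state).gather(1, next_action).detach()
target = reward + gamma * next_q * (1 - done)
print(target.shape)  # torch.Size([4, 1])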