I am trying to use reinforcement learning to beat a random opponent. After a few hundred games I got to about 32 wins, 8 draws, 10 losses (and similar scores), but instead of improving further it collapsed. The picture shows the count of wins (blue) over the last 50 games.
(The game is like Connect Four, but with five in a row and without gravity, so you can place the X and O on any free cell of the board, as if playing on paper.)
After a few thousand games it still ends up stuck at a reward of 0.5, i.e. only draws or a 50:50 win/loss ratio.
Here is my complete code.
The main game (Env):
import numpy as np

N = 12

class Env:
    def __init__(self):
        self.F = np.zeros(shape=(N, N))

    def random_move(self, spieler):
        rnd = np.random.randint(0, N * N)
        self.move(spieler=spieler, zug=rnd)

    def move(self, spieler, zug):
        # decode the flat move index into board coordinates;
        # a move onto an occupied cell is silently ignored
        x, y = zug // N, zug % N
        if self.F[x, y] == 0:
            self.F[x, y] = spieler

    def reset(self):
        self.F = np.zeros(shape=(N, N))

    def invert(self):
        eins = self.F == 1
        zwei = self.F == 2
        self.F[eins] = 2
        self.F[zwei] = 1

    def check1(self, x, y, spieler):
        # five in a column
        return np.all(self.F[x:x + 5, y] == spieler)

    def check2(self, x, y, spieler):
        # five in a row
        return np.all(self.F[x, y:y + 5] == spieler)

    def check3(self, x, y, spieler):
        # five on the main diagonal
        return np.all(np.diag(self.F[x:x + 5, y:y + 5]) == spieler)

    def check4(self, x, y, spieler):
        # five on the anti-diagonal
        return np.all(np.diag(np.fliplr(self.F[x:x + 5, y:y + 5])) == spieler)

    def eval(self):
        # 1 / 2 = win for player 1 / 2, -1 = board full (draw), 0 = game still running
        for x in range(N - 4):
            for y in range(N):
                if self.check1(x, y, spieler=1):
                    return 1
                if self.check1(x, y, spieler=2):
                    return 2
        for x in range(N):
            for y in range(N - 4):
                if self.check2(x, y, spieler=1):
                    return 1
                if self.check2(x, y, spieler=2):
                    return 2
        for x in range(N - 4):
            for y in range(N - 4):
                if self.check3(x, y, spieler=1):
                    return 1
                if self.check3(x, y, spieler=2):
                    return 2
        for x in range(N - 4):
            for y in range(N - 4):
                if self.check4(x, y, spieler=1):
                    return 1
                if self.check4(x, y, spieler=2):
                    return 2
        if np.any(self.F == 0):
            return 0
        return -1
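(Just for illustration, here is a minimal sketch that drives the Env with two random players; it is not part of my training code and assumes the class is importable as `from main import Env`, the same import the TF part below uses.)

from main import Env  # the Env class defined above

if __name__ == "__main__":
    env = Env()
    result = 0
    for turn in range(144 // 2):
        env.random_move(spieler=1)   # moves onto occupied cells are simply skipped
        env.random_move(spieler=2)
        result = env.eval()
        if result != 0:              # 1 = player 1 won, 2 = player 2 won, -1 = draw
            break
    print("result after", turn + 1, "turns:", result)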
And the TF part:
from tensorflow import initializers
from tensorflow.python.framework.ops import convert_to_tensor
from tensorflow.python.keras import Sequential, Input
from tensorflow.python.keras.backend import expand_dims
from tensorflow.python.keras.layers import Dense, Dropout, Conv2D, Reshape, Flatten, Conv1D, BatchNormalization, \
    MaxPooling2D
import numpy as np
from tensorflow.python.keras.losses import Huber, CategoricalCrossentropy, MeanAbsoluteError, MeanSquaredError
from tensorflow.python.keras.models import Model
import tensorflow as tf
from ParEnv import ParEnv
from main import Env


def convert_reward(rew):
    # map the result of Env.eval() (1 = win, 2 = loss, -1 = draw) to a reward in [0, 1]
    if rew == 1:
        return 1
    if rew == 2:
        return 0
    if rew == -1:
        return 0.5


def model():
    inputs = Input(shape=(12, 12, 1), dtype='float32')
    # common trunk
    # x = BatchNormalization()(x)
    x = Conv2D(64, kernel_size=(7, 7), strides=(2, 2), padding="same")(inputs)  # -> (6, 6, 64)
    x = Conv2D(128, kernel_size=(5, 5), strides=(2, 2), padding="same")(x)      # -> (3, 3, 128)
    x = Conv2D(128, kernel_size=(3, 3), strides=(2, 2), padding="same")(x)      # -> (2, 2, 128)
    x = MaxPooling2D(pool_size=(2, 2))(x)                                       # -> (1, 1, 128)
    x = Dense(256, activation="relu")(x)
    # x = Dropout(0.2)(x)
    # x = Dense(1024, activation="relu")(x)
    # x = Dense(1024, activation="relu")(x)
    common = Dense(512, activation="relu")(x)
    # actor head
    x = Dense(128, activation="relu")(common)
    # x = Dense(256, activation="relu")(x)
    # x = Dropout(0.2)(x)
    # x = Dense(256, activation="relu")(x)
    # x = Dense(256, activation="relu")(x)
    action = Dense(144, activation="sigmoid")(x)
    # critic head
    x = Dense(128, activation="relu")(common)
    # x = Dense(128, activation="relu")(x)
    # x = Dropout(0.1)(x)
    x = Dense(64, activation="relu")(x)
    critic = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inputs, outputs=[action, critic])
    return model
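
# (Illustrative side note, not part of the training code: since there is no Flatten layer,
#  the Dense layers act on a (batch, 1, 1, channels) tensor, so the action head comes out
#  as shape (batch, 1, 1, 144) and the critic as (batch, 1, 1, 1); the training loop below
#  removes the extra dimensions with tf.squeeze.)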
if __name__ == "__main__":
    log = open("log.txt", "w")
    avg_reward = []
    # optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    optimizer = tf.keras.optimizers.Adamax(learning_rate=0.01, beta_1=0.99)
    loss = Huber()
    games_played = 0
    par_games = 1000
    cnt_games = 25000
    pe = ParEnv(par_games)
    env = Env()
    player1 = model()
    # player2 = model()
    for g in range(cnt_games):
        action_probs_history = []
        critic_value_history = []
        reward = 0.0
        with tf.GradientTape() as tape:
            return_history = []
            for batch in range(3):
                for i in range(144 // 2):
                    state = convert_to_tensor(env.F.astype(float))
                    state = expand_dims(state, 0)
                    zug_wahrscheinl, critic_value = player1(state)
                    # print(x)
                    critic_value_history.append(tf.squeeze(critic_value))
                    whl = abs(((np.squeeze(zug_wahrscheinl)) - 0.4999) * 2)  # cut out the moves that lead to a loss
                    tfWhl = tf.squeeze(zug_wahrscheinl)
                    whl2 = whl / np.sum(whl)
                    zug1 = np.random.choice(144, p=whl2.flatten())
                    # zug1 = np.argmax(zug_wahrscheinl)
                    ab = tfWhl[zug1]
                    action_probs_history.append(ab)
                    env.move(spieler=1, zug=zug1)
                    env.random_move(spieler=2)
                    reward = env.eval()
                    done = reward != 0
                    if done:
                        # print("Batch ", batch, " Game ", g, " Rounds played ", i, " Reward: %.2f" % reward)
                        env.reset()
                        avg_reward.append(convert_reward(reward))  # only for the stats
                        games_played += 1
                        break
                if not done:
                    # print("Game ", g, " Rounds played ", i, " Draw! \t\t", end=" ")
                    avg_reward.append(0.5)
                    games_played += 1
                    env.reset()
                #### game over ####
                # return_history = [reward for x in range(i)]
                if reward == 1:  # win
                    return_history.extend([0.5 / (1.25 ** x) + 0.5 for x in reversed(range(i + 1))])  # ramps from 0.5 up to 1
                elif reward == 2:  # loss
                    return_history.extend([0.5 / (1.25 ** x) for x in (range(i + 1))])  # ramps from 0.5 down to 0
                else:  # draw
                    return_history.extend([0.5 for x in range(i + 1)])
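                # (Illustrative numbers for the shaping above, assuming a win after i = 3, i.e. 4 of my moves:
                #  reversed(range(4)) -> 3, 2, 1, 0, so the appended returns are
                #  [0.5/1.25**3 + 0.5, 0.5/1.25**2 + 0.5, 0.5/1.25**1 + 0.5, 0.5/1.25**0 + 0.5]
                #  ~= [0.756, 0.82, 0.9, 1.0], i.e. later moves get a return closer to 1.)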
            actor_losses = []
            critic_losses = []
            history = zip(action_probs_history, critic_value_history, return_history)
            for prob, value, ret in history:
                # actor_losses.append(loss(tf.expand_dims(prob, 0), tf.expand_dims(value, 0)))
                diff = value - ret + 1  # force the actor to produce high probability values
                actor_losses.append(1 / (prob + 1) * diff)
                # actor_losses.append(loss(tf.expand_dims(prob, 0), tf.expand_dims(value, 0)))
                critic_losses.append(
                    loss(tf.expand_dims(ret, 0), tf.expand_dims(value, 0))
                )
            # Backpropagation
            s_actor_loss = tf.reduce_mean(actor_losses)  # mean, because the rounds can have different lengths
            s_critic_loss = tf.reduce_mean(critic_losses)
            loss_value = s_actor_loss + s_critic_loss  # tf.dtypes.cast(sum(critic_losses), tf.float32)
        grads = tape.gradient(loss_value, player1.trainable_variables)
        optimizer.apply_gradients(zip(grads, player1.trainable_variables))
        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        # rewards_history.clear()
        # print("Loss Sum {:.2f} Game {} ".format(loss_value, g))
        if g == 1:
            player1.summary()
        k = np.array(avg_reward)
        o = k[-50:]
        if g > 5:
            print("Played %d games Avg Reward Overall: \t%.3f\tlast 50 Games:\t %.3f Loss\t %.4f (%.4f + %.4f) W-D-L %d %d %d" % (
                games_played, np.mean(k), np.mean(o), loss_value, s_actor_loss, s_critic_loss,
                np.count_nonzero(o == 1), np.count_nonzero(o == 0.5), np.count_nonzero(o == 0)))
            log.write("%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % (
                reward, np.mean(k), np.mean(o), np.count_nonzero(o == 1), np.count_nonzero(o == 0.5),
                np.count_nonzero(o == 0), loss_value, s_actor_loss, s_critic_loss))
            log.flush()
    # player1.summary()
    log.close()
    print("hi")
I think there is some progress, but something is going wrong and I no longer know what to search for. I spent most of my time on the actor loss and ended up with what you see above. Can anyone help me get the model to learn further / become more stable?