import numpy as np
from numpy import exp, array, random, dot
R = np.matrix([[-1, -1, -1, -1,1, -1], # for correct action the
reward is 1 and for wrong action it's -1
[-1, -1, -1, 1, -1, 1],
[-1, -1, -1, 1, -1, -1],
[-1, 1, 1, -1, 1, -1],
[-1, 1, 1, -1, -1, 1],
[-1, 1, -1, -1, 1, 1]])
Q = np.matrix(np.zeros([6, 6])) # Q matrix
gamma = 0.99 # Gamma (learning parameter).
lr = 0.1 # learning rate
initial_state = 1 # Initial state.
w = 2*np.random.random((6,1)) - 1 # random weights for each action
def sigmoid(x):
return 1 / (1 + exp(-x))
def sigmoid_derivative(x):
return x * (1 - x)
def available_actions(state): # This function returns all available actions
in the state given as an argument
current_state_row = R[state,]
return current_state_row
def sample_next_action(available_actions_range): # This function chooses at
random which action to be performed within the rangeof all the available
actions.
next_action = int(np.random.choice(available_actions_range, 1))
print("next_action: ", next_action)
return next_action
def update(action, gamma,predict): # This function updates the Q matrix
according to the path selected
Q[current_state, action] = action + gamma * predict
return Q[current_state, action]
for i in range(1000): ## Training
current_state = np.random.randint(0, int(Q.shape[0]))
available_act = np.array(available_actions(current_state))
action = np.max(available_act)
Qout = np.multiply(available_act,w) # forward propagation # multiplying
the selected action with the weights)
predict = sigmoid(np.max(Qout))
Qtarget = update(action, gamma, predict) # target q values(rewards)
loss = np.sum(Qtarget - Qout) # backward prop # calculating the error
between predicted and target q values
adjustment = np.multiply(action ,loss , Qout)
w = w + adjustment # adjusting weights
Q += Qout
print("Trained network:")
print("--------")
print("weights : ", w)
np.set_printoptions(precision=2,suppress=True)
print(Q / np.max(Q) * 100) # Normalize the "trained" Q matrix
我是神经网络和强化学习的新手。我正在尝试创建一个神经网络(用于学习目的),其中包含一个具有6个输入和1个输出的神经元。我面临的问题如下 -
我不确定我通过Q网络预测Q值的方式是否正确?我不确定我正在做的计算是找出目标Q值是否正确。
当我运行代码时,权重变为' Nan'经过几次迭代后,这导致Q矩阵具有所有的Nan值。