我应该创建自己的环境并在多代理环境中应用 dqn 算法。
我有 4个代理商。我的环境的每个状态都有 5个变量 state=[p1, p2, p3, p4,p5]
,在每个时间步,我们都会更新所有状态的不同参数。行动是金额之一:{-2,-1,0,1,2}
给出最佳q值。
param0,param1,param2,param3,param4=[[0 for x in range(numframe)] for y in range(number_nodes)]
`timestep p4[agent0]=random.randint(0,2)
p4[agent1]=p4[agent0]+action
p4[agent2]=p4[agent1]+action
p4[agent3]=p4[agent2]+action
(actions find by a DNN in dqn and can be one of {-2,-1,0,1,2})`
param0..5=[[0 for x in range(numframe)] for y in range(number_nodes)]
numframe:显示体验 - 重播的数量,number_nodes = 4显示代理数量
我根据[dqn-keras-code] [1],
编写了以下代码 1-如何将其更改为多代理?
2-我怎么能改变写我的重置? (我应该重置为0
每个参数)
我写了一些代码但是因为我是dqn和multi-agent的初学者,我看到了以下错误:(我知道它也有一些与多代理相关的问题)
line 156, in <module>
state = env.reset()
TypeError: reset() missing 1 required positional argument: 'self'
如果我可以修复重置部分和步骤部分,您能否帮我解决一下这个错误?
这是我的代码:
import random
import numpy as np
import tensorflow as tf
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
#-----------------------------------------------------------------
global param0,param1,param2,param3,param4,state,next_state,action_space,action_size,w,m, reward,episodes,time_t,state
#--------------------------------------------------------------------------
episodes=2000
number_nodes=5 #one more than number of nodes
timemax=500
action_size=5
state_size=5
action_space=[-2,-1,0,1,2]
m=16 #4*(ltime+ftime)=16
numframe=16
#-------------------------------------------------------------------------
class env:
def __init__(self):
self.action_space=[-2,-1,0,1,2] # X=[-2,2]
self.action_size = len(self.action_space)
self.state = None
return action_space, action_size
def reset(self):
#self.action_space=[0,0,0,0,0]
for ii in range (1,4): #both sides
param1[ii]=0
param2[ii]=0
param3[ii]=0
param4[ii]=0
param0[ii]=0
reward[ii]=0
state[ii]=[param0[ii],param1[ii],param2[ii],param3[ii],param4[ii]]
return state
# def reset(self):
# self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
# self.steps_beyond_done = None
# return np.array(self.state)
def step(self,action):
state = self.state
param1, param2, param3, param4, param0 = state
param0[0]=random.randint(0,2) #produce a random param0
#relationship between parameteres for refreshing
param0[1]=param0[0]+action
param0[2]=param0[1]+action
param0[3]=param0[2]+action
param0[4]=param0[3]+action
for i in range (1,4):
param1[time_t][i]=param4[time_t][i+1]-param0[i+1]
#action[i]=agent.init(state_size, action_size)
#relationship between parameteres for refreshing
param2[time_t][i]=param0[i]+action
param3[time_t][i]=param2[time_t][i]
param4[time_t][i]=param3[time_t][i]
#param1,param3,param4,param0
next_state[i]=[param1[time_t][i],param2[time_t][i],param3[time_t][i],param4[time_t][i],param0[i]]
cp= [2, 0, 0, 0]
ch= [2, 2, 2, 2]
# reward function
if param1[i]>=0:
reward[i]+=ch[i]*param1[time_t][i]
else:
reward[i]+=cp[i]*param1[time_t][i]
return next_state, reward
#-------------------------------------------------
class DQNAgent:
def __init__(self, state_size, action_size):
self.state_size = state_size
self.action_size = action_size
self.memory = deque(maxlen=2000)
self.gamma = 0.95 # discount rate
self.epsilon = 1.0 # exploration rate
self.epsilon_min = 0.01
self.epsilon_decay = 0.995
self.learning_rate = 0.001
self.model = self._build_model()
def _build_model(self):
# Neural Net for Deep-Q learning Model
model = Sequential()
model.add(Dense(24, input_dim=self.state_size, activation='relu'))
model.add(Dense(24, activation='relu'))
model.add(Dense(self.action_size, activation='linear'))
model.compile(loss='mse',
optimizer=Adam(lr=self.learning_rate))
return model
def remember(self, state, action, reward, next_state, done):
self.memory.append((state, action, reward, next_state, done))
def act(self, state):
if np.random.rand() <= self.epsilon:
return random.randrange(self.action_size)
act_values = self.model.predict(state)
return np.argmax(act_values[0]) # returns action
def replay(self, batch_size):
minibatch = random.sample(self.memory, batch_size)
for state, action, reward, next_state, done in minibatch:
target = reward
if not done:
target = (reward + self.gamma *
np.amax(self.model.predict(next_state)[0]))
target_f = self.model.predict(state)
target_f[0][action] = target
self.model.fit(state, target_f, epochs=1, verbose=0)
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
def load(self, name):
self.model.load_weights(name)
def save(self, name):
self.model.save_weights(name)
if __name__ == "__main__":
#env = gym.make('CartPole-v1')
#state_size = env.observation_space.shape[0]
#action_size = env.action_space.n
state_size=4
action_size=5
agent = DQNAgent(state_size, action_size)
# agent.load("./save/cartpole-dqn.h5")
done = False
batch_size = 32
for e in range(episodes):
state = env.reset()
state = np.reshape(state, [1, state_size])
for time in range(500):
# env.render()
action = agent.act(state)
next_state, reward, done, _ = env.step(action)
reward = reward if not done else -10
next_state = np.reshape(next_state, [1, state_size])
agent.remember(state, action, reward, next_state, done)
state = next_state
if done:
print("episode: {}/{}, score: {}, e: {:.2}"
.format(e, EPISparam2DES, time, agent.epsilon))
break
if len(agent.memory) > batch_size:
agent.replay(batch_size)
# if e % 10 == 0:
# agent.save("./save/cartpole-dqn.h5")
agent = DQNAgent(state_size, action_size)
# agent.load("./save/cartpole-dqn.h5")
[1]: https://github.com/keon/deep-q-learning/blob/master/dqn.py