我想使用 DQL 来教导代理在流体中遵循预定路径。问题是,代理是一个球体游泳者,很像 this one,只是在 2D 中(这意味着它是一个十字形的代理,在尖端和中心有球体)。动作意味着伸展或收回手臂,动作会改变 x 和 y 位置以及角度。例如,如果代理的状态是 [1,0,0,1](手臂 0 和 3 伸展),并且动作是 [0,1,0,1],则下一个状态是 [1,1,0,0]。
神经网络的输入是状态(例如 [1,0,0,1])以及位置和角度(例如 0.4,-0.3,0.02)。根据这两者,它应该确定最大化其奖励的最佳动作(通过遵循轨迹)。
有关此路径的唯一信息由奖励函数 r = exp(-abs(path)) 给出,其中 path 是轨迹方程。假设路径是 y = x^2/10,所以 path = y - x**2/10。
基本代码在这里(我不会发布整个代码,否则它会变得一团糟。只够让消息通过):
# --- DQN hyperparameters ---
iters = 20000  # total environment/training steps
N_SPHERES = 5  # spheres in the cross-shaped swimmer (center + 4 tips)
N_LINKS = N_SPHERES - 1  # number of arms == length of the binary state vector
N_NEURONS = 16  # hidden-layer width
MEM_SIZE = 400  # replay-buffer capacity
BATCH_SIZE = 200  # minibatch size drawn per replay step
ALPHA, GAMMA, LAMBDA = 1., 0.8, 0.0005  # Q update rate, discount factor, epsilon decay rate
MAX_EPS, MIN_EPS = 0.85, 0.  # exploration-probability bounds
# NOTE(review): syncing the target net every single step (interval 1) defeats
# the stabilization a separate target network provides — consider 100+.
UPDATE_INTERVAL = 1
class NN:
    """Pair of Q-networks (online + target) mapping [state(4), x, y, angle]
    (7 inputs) to one Q-value per action."""

    def __init__(self, env):
        self.env = env
        self.num_actions = self.env.num_actions
        # State vector length is the number of arms; env.num_states unused here.
        self.num_states = N_LINKS  # self.env.num_states
        # BUG FIX: define_model previously assigned to self.model *inside* the
        # method, so the second call overwrote it and self.model ended up being
        # the SAME object as self.model_alt — the target network never differed
        # from the online one and upgrade_model() was a no-op.
        self.model = self.define_model('Main_Model_2D')
        self.model_alt = self.define_model('Target_Model_2D')
        with open(dir_ + '/params.txt', 'a+') as fh:
            self.model.summary(print_fn=lambda x: fh.write(x + '\n'))

    def define_model(self, name='Main_Model_2D'):
        """Build and compile a fresh network (returned, never stored on self)."""
        model = tf.keras.Sequential(name=name)
        model.add(Dense(N_NEURONS, activation=tf.nn.leaky_relu, input_dim=7))
        # Q-value regression needs an unconstrained output; a linear head lets
        # the network represent any real-valued return without distortion.
        model.add(Dense(self.num_actions, activation='linear'))
        model.compile(loss='mean_squared_error',
                      optimizer=tf.keras.optimizers.Adam())
        return model

    def predict(self, state):
        """Q-values for a single 7-element state; returns shape (1, num_actions)."""
        return self.model(np.asarray(state, dtype=np.float32).reshape(1, -1))

    def predict_batch(self, states):
        """Q-values for a batch of states from the online network."""
        return self.model(np.asarray(states, dtype=np.float32))

    def predict_alt_batch(self, states):
        """Q-values for a batch of next-states from the target network,
        used for improved stabilization."""
        return self.model_alt(np.asarray(states, dtype=np.float32))

    def upgrade_model(self):
        """Sync the target network's weights from the online network."""
        self.model_alt.set_weights(self.model.get_weights())

    def train(self, dataxy, dataz):
        """Fit the online network on one replay batch (inputs, target Q-values).

        BUG FIX: the original converted arrays to lists and passed both
        steps_per_epoch and batch_size, which conflict for in-memory data;
        feeding the numpy arrays directly with batch_size alone is correct.
        """
        self.model.fit(np.asarray(dataxy), np.asarray(dataz), epochs=10,
                       batch_size=BATCH_SIZE, shuffle=False, verbose=0)
class Memory:
    """Fixed-capacity FIFO replay buffer with uniform random sampling."""

    def __init__(self, max_memory):
        self._max_memory = max_memory
        self._samples = []

    def add_sample(self, sample):
        # Append the newest transition; evict the oldest one once the
        # buffer exceeds its capacity.
        self._samples.append(sample)
        if len(self._samples) > self._max_memory:
            del self._samples[0]

    def sample(self, no_samples):
        # Uniform sample without replacement, capped at the current buffer
        # size when more samples are requested than are stored.
        count = min(no_samples, len(self._samples))
        return random.sample(self._samples, count)
class Runner:
    """Drives the DQN loop: choose an action, step the environment, store the
    transition, and replay-train each step.

    NOTE(review): __init__ is not shown in this snippet — it presumably binds
    self.env, self.mem and self.model (see the construction call at the bottom
    of the file); confirm against the full source.
    """

    def run(self):
        c_state, Xcg, Ycg, ang = self.env.reset()
        tot_rew = 0.
        pbar = trange(iters, desc='', leave=True)
        self.steps = 0
        for i in pbar:
            self.steps = i
            # BUG FIX: epsilon is now computed BEFORE acting, so the very
            # first choose_action call no longer reads an unset self.eps.
            self.eps = MIN_EPS + (MAX_EPS - MIN_EPS) * np.exp(-LAMBDA * self.steps)
            c_pos = np.array([Xcg, Ycg, ang])
            c_state_pos = np.concatenate((c_state, c_pos))
            r1 = self.env.calc_reward(Xcg, Ycg)
            action = self.choose_action(c_state_pos)
            deltaX, deltaY, deltaTheta = self.env.calc_pos_theta(c_state, action,
                                                                 Xcg, Ycg, ang)
            Xcg += deltaX
            Ycg += deltaY
            # BUG FIX: wrap into [0, 2*pi) with a modulo — the original only
            # handled ang > 2*pi, so negative rotations drifted unbounded.
            ang = (ang + deltaTheta) % (2 * np.pi)
            n_pos = np.array([Xcg, Ycg, ang])
            n_state = self.env.calc_next_step(c_state, action)
            n_state_pos = np.concatenate((n_state, n_pos))
            r2 = self.env.calc_reward(Xcg, Ycg)
            rew = r2 - r1  # shaped reward: improvement of the path score
            tot_rew += rew
            self.mem.add_sample((c_state_pos, action, rew, n_state_pos))
            self.replay()
            c_state = n_state
            # (removed: the extra self.steps += 1 fought the loop counter, and
            # the trailing `n_state = None` was dead code — the transition had
            # already been stored.)

    def choose_action(self, state):
        """Epsilon-greedy one-hot action vector of length num_actions.

        BUG FIX: the original sized the action vector (and the random index)
        with num_states, while the greedy branch argmaxes a num_actions-long
        Q output — both sizes must be num_actions or the one-hot can be
        mis-shaped / mis-indexed.
        """
        chosen_action = np.zeros(self.model.num_actions)
        if np.random.rand() < self.eps:
            chosen_action[random.randint(0, self.model.num_actions - 1)] = 1
        else:
            P = self.model.predict(state)
            chosen_action[np.argmax(P)] = 1
        return chosen_action

    def replay(self):
        """One DQN fitting pass over a minibatch from replay memory."""
        batch = self.mem.sample(BATCH_SIZE)
        states = np.array([val[0] for val in batch])
        # BUG FIX: stored vectors are state (num_states) + (x, y, angle), so a
        # terminal placeholder must be num_states + 3 long, not num_states.
        next_states = np.array([(np.zeros(self.model.num_states + 3)
                                 if val[3] is None else val[3]) for val in batch])
        Qsa = self.model.predict_batch(states)            # Q(s, .) online net
        Qsad = self.model.predict_alt_batch(next_states)  # Q(s', .) target net
        x = np.zeros((len(batch), self.model.num_states + 3))
        y = np.zeros((len(batch), self.model.num_actions))
        for i, b in enumerate(batch):
            # s, a, rw, ns = state, action, reward, next_state
            s, a, rw, ns = b[0], b[1], b[2], b[3]
            c_q = Qsa[i].numpy()
            n_q = Qsad[i].numpy()
            if ns is None:
                # Terminal transition: no bootstrapped future value.
                c_q[np.argmax(a)] = rw
            else:
                c_q[np.argmax(a)] = ALPHA * (rw + GAMMA * np.amax(n_q)) \
                                    + (1 - ALPHA) * c_q[np.argmax(a)]
            x[i] = s
            y[i] = c_q
        self.model.train(x, y)
        # BUG FIX: the original called the *global* `model` here instead of
        # self.model, silently coupling the class to module-level state.
        if self.steps % UPDATE_INTERVAL == 0:
            self.model.upgrade_model()
# Wire the environment, replay buffer, Q-network pair and runner together,
# then launch training.
env = Env2D(N_SPHERES)
memory = Memory(MEM_SIZE)
model = NN(env)
# NOTE(review): Runner is constructed with 4 arguments but no __init__ appears
# in this snippet, and `F` is undefined here — both presumably live in the
# elided part of the code; confirm against the full source.
r = Runner(env, memory, model, F)
r.run()
我使用的环境是这样的:
class Env2D:
    # 2D swimmer environment: binary arm-state transitions plus a shaped
    # path-following reward. (calc_pos_theta is pseudocode in the post.)
    def reset(self):
        # Start with every arm retracted, at the origin, heading 0.
        # NOTE(review): the state vector is sized with num_actions here and in
        # calc_next_step, while the rest of the code sizes states with
        # N_LINKS — confirm the two are equal, otherwise the 7-wide NN input
        # breaks.
        return np.zeros(self.num_actions), 0., 0., 0.
    def calc_next_step(self,state,action):
        # XOR of state and action: each action bit toggles the matching arm
        # (extend <-> retract), implemented as (state + action) mod 2.
        next_state = np.remainder(state+action,2*np.ones(self.num_actions, dtype=int))
        return next_state
    def calc_pos_theta(self,state,action,posx,posy,ang):
        # Placeholder from the original post: the hydrodynamic step that
        # computes the displacement and rotation produced by one stroke.
        yada yada yada it calculates the variation in position and angle
        return deltaX, deltaY, deltaTHETA
    def calc_reward(self,posx,posy):
        # Shaped reward: exp(-2|y - x^2/10|) rewards staying on the parabola,
        # while sqrt(x^2 + y^2) * posx rewards moving outward along +x.
        # NOTE(review): this differs from the stated r = exp(-|path|); the
        # distance * x factor grows without bound and can dominate the path
        # term far from the origin — a plausible cause of the agent stalling
        # or wandering near x = y = 10. Confirm it is intended.
        r = 2*np.exp(-2*abs(posy-posx**2/10))*np.sqrt(posx**2+posy**2)*posx
        return r
问题是,它应该轻松遵循轨迹。但它一直到 x=y=10 左右,然后停止。它不会移动到任何地方,有时甚至会完全转移到其他地方。此外,当我将此模型加载到全新的神经网络中时,它不起作用。我应该如何进行?