Why isn't deep Q-learning working effectively?

Date: 2021-05-06 18:43:47

Tags: python tensorflow deep-learning

I want to use DQL to teach an agent to follow a predetermined path through a fluid. The catch is that the agent is a spherical swimmer, much like this one, only in 2D (meaning it is a cross-shaped agent with spheres at the tips and at the center). An action means extending or retracting an arm, and each action changes the x and y position as well as the angle. For example, if the agent's state is [1,0,0,1] (arms 0 and 3 extended) and the action is [0,1,0,1], the next state is [1,1,0,0].
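In code, that state update is just element-wise addition modulo 2 (which is also what calc_next_step in the environment below does):

import numpy as np

state  = np.array([1, 0, 0, 1])   # arms 0 and 3 extended
action = np.array([0, 1, 0, 1])   # toggle arms 1 and 3
next_state = np.remainder(state + action, 2)
print(next_state)                 # -> [1 1 0 0]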

The input to the neural network is the state (e.g. [1,0,0,1]) together with the position and angle (e.g. 0.4, -0.3, 0.02). From these two it should determine the best action, i.e. the one that maximizes its reward (by following the trajectory).
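Concretely, the 7-dimensional network input is the 4-element state concatenated with (x, y, angle), exactly as it is built in Runner.run below:

state = np.array([1, 0, 0, 1])
pos   = np.array([0.4, -0.3, 0.02])      # x position, y position, angle
nn_input = np.concatenate((state, pos))  # 7 values, matching input_dim = 7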

The only information about the path comes from the reward function r = exp(-abs(path)), where path is the trajectory equation. Say the path is y = x^2/10, so path = y - x**2/10.
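Written out as a standalone function, that reward is simply the following (the calc_reward in the environment further down multiplies in some extra distance factors):

def reward(x, y):
    path = y - x**2 / 10       # deviation from the target trajectory y = x^2/10
    return np.exp(-abs(path))  # r = exp(-abs(path)), maximal exactly on the path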

The essential code is below (I won't post all of it, otherwise it would become a mess; just enough to get the point across):

# imports inferred from the snippets below
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tqdm import trange

iters = 20000
N_SPHERES = 5
N_LINKS = N_SPHERES - 1
N_NEURONS = 16
MEM_SIZE = 400
BATCH_SIZE = 200
ALPHA, GAMMA, LAMBDA = 1., 0.8, 0.0005  # Q-update step size, discount factor, epsilon-decay rate
MAX_EPS, MIN_EPS = 0.85, 0.
UPDATE_INTERVAL = 1

class NN:
    def __init__(self,env):
        self.env = env
        self.num_actions = self.env.num_actions
        self.num_states = N_LINKS#self.env.num_states
        self.model = self.define_model()
        self.model_alt = self.define_model()
        with open(dir_+'/params.txt','a+') as fh:
            self.model.summary(print_fn = lambda x: fh.write(x+'\n'))
    
    def define_model(self):
        self.model = tf.keras.Sequential(name = 'Main_Model_2D')
        self.model.add(Dense(N_NEURONS, activation = tf.nn.leaky_relu, input_dim = 7))
        #self.model.add(Dense(N_NEURONS, activation = tf.nn.leaky_relu))
        #self.model.add(Dense(N_NEURONS/2, activation = tf.nn.leaky_relu))
        self.model.add(Dense(self.num_actions, activation = tf.nn.leaky_relu))
        self.model.compile(loss = 'mean_squared_error', optimizer = tf.keras.optimizers.Adam())
        return self.model
    
    def predict(self, state): # given a single input, return the NN output for it
        # (body reconstructed as a plausible sketch; the original was omitted)
        return self.model(state.reshape(1, -1)).numpy()[0]

    def predict_batch(self, states): # given a batch of inputs, return the NN outputs for them
        return self.model(states)

    def predict_alt_batch(self, states): # same as above but for the alternate (target) NN,
        # used for improved stabilization
        return self.model_alt(states)

    def upgrade_model(self):
        self.model_alt.set_weights(self.model.get_weights())

    def train(self,dataxy,dataz):
        dataxy, dataz = dataxy.tolist(), dataz.tolist()
        self.model.fit(dataxy,dataz,epochs = 10, steps_per_epoch = 7, batch_size = BATCH_SIZE,
                       shuffle = False, verbose=False)

class Memory:
   def __init__(self, max_memory):
       self._max_memory = max_memory
       self._samples = []

   def add_sample(self, sample):
       self._samples.append(sample)
       if len(self._samples) > self._max_memory:
           self._samples.pop(0)

   def sample(self, no_samples):
       if no_samples > len(self._samples):
           return random.sample(self._samples, len(self._samples))
       else:
           return random.sample(self._samples, no_samples)

class Runner:
    
    def run(self):
       c_state, Xcg, Ycg, ang = self.env.reset()
       tot_rew = 0.
       pbar=trange(iters, desc = '', leave = True)
       self.steps = 0
       for i in pbar:
           self.steps = i
           
           c_pos = np.array([Xcg,Ycg,ang])
           c_state_pos = np.concatenate((c_state,c_pos))
           r1 = self.env.calc_reward(Xcg, Ycg)
           
           action = self.choose_action(c_state_pos)
           deltaX, deltaY, deltaTheta = self.env.calc_pos_theta(c_state, action,\
                                                                Xcg, Ycg, ang)
           
           Xcg+=deltaX
           Ycg+=deltaY
           ang+=deltaTheta
           if ang > 2*np.pi:
               ang += -2*np.pi
               
           n_pos = np.array([Xcg,Ycg,ang])        
           n_state = self.env.calc_next_step(c_state, action)
           n_state_pos = np.concatenate((n_state,n_pos))
           r2 = self.env.calc_reward(Xcg, Ycg)

           rew = r2 - r1
           tot_rew += rew
           
           self.mem.add_sample((c_state_pos, action, rew, n_state_pos))
           self.replay()
           self.eps = MIN_EPS+(MAX_EPS-MIN_EPS)*np.exp(-LAMBDA * self.steps)
           c_state = n_state
           c_pos = n_pos
           self.steps += 1
           if self.steps == iters:
               n_state = None
    
    def choose_action(self,state):
        chosen_action = np.zeros(self.model.num_states)
        if np.random.rand() < self.eps:
            chosen_action[random.randint(0, self.model.num_states-1)] = 1
        else:
            P = self.model.predict(state)
            chosen_action[np.argmax(P)] = 1
        return chosen_action
        
    def replay(self):
       batch = self.mem.sample(BATCH_SIZE)
       states = np.array([val[0] for val in batch])
       next_states = np.array([(np.zeros(self.model.num_states)
                            if val[3] is None else val[3]) for val in batch])
       
       Qsa = self.model.predict_batch(states)           #Q
       Qsad = self.model.predict_alt_batch(next_states) #Q*

       x = np.zeros((len(batch), 7))
       y = np.zeros((len(batch), self.model.num_actions))
       for i, b in enumerate(batch):
           s, a, rw, ns = b[0], b[1], b[2], b[3]
           c_q = Qsa[i].numpy()
           n_q = Qsad[i].numpy()
           #s,a,r,ns = state, action, reward, next_state
           #c_q, n_q = current_q, next_q
           if ns is None:
               c_q[np.argmax(a)] = rw
           else:  
               c_q[np.argmax(a)] = ALPHA*(rw+GAMMA*np.amax(n_q))\
                              +(1-ALPHA)*c_q[np.argmax(a)]
           x[i] = s
           y[i] = c_q
       self.model.train(x, y)
       if self.steps % UPDATE_INTERVAL == 0:
           self.model.upgrade_model() # sync the alternate (target) network with the main network

env = Env2D(N_SPHERES)
memory = Memory(MEM_SIZE)
model = NN(env)
r = Runner(env, memory, model, F)
r.run()

The environment I use looks like this:

class Env2D:
    def reset(self):
        return np.zeros(self.num_actions), 0., 0., 0.
    
    def calc_next_step(self,state,action):
        next_state = np.remainder(state+action,2*np.ones(self.num_actions, dtype=int))
        return next_state

    def calc_pos_theta(self,state,action,posx,posy,ang):

        # yada yada yada: it calculates the variation in position and angle
        return deltaX, deltaY, deltaTHETA

    def calc_reward(self,posx,posy):
        r = 2*np.exp(-2*abs(posy-posx**2/10))*np.sqrt(posx**2+posy**2)*posx
        return r

The problem is that it should be able to follow the trajectory easily, but it gets to around x = y = 10 and then stalls. It stops going anywhere, and sometimes it even wanders off somewhere else entirely. On top of that, when I load this model into a brand-new neural network, it doesn't work. How should I proceed?
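(The loading step is assumed here to be the standard tf.keras weight round-trip, roughly as follows; the file name is only illustrative.)

model.model.save_weights('dqn_weights.h5')   # persist the trained main network

fresh = NN(env)                               # brand-new network with the same architecture
fresh.model.load_weights('dqn_weights.h5')    # restore the trained weights into it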
