Parameter estimation using reinforcement learning

Date: 2020-03-04 05:14:23

Tags: python keras reinforcement-learning odeint

I want to use reinforcement learning for parameter estimation. Basically, I have an ordinary differential equation (ODE), y' = a*cos(w*t + d)*w. The actual function is y = a*sin(w*t + d), so I want to estimate a, w and d. I solve this ODE with scipy.integrate.odeint.
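
To make the setup concrete, here is a minimal, self-contained sketch (not part of the original post) of integrating this ODE with odeint and comparing it to the closed form y = a*sin(w*t + d). Note that the two only coincide when the initial value is y(0) = a*sin(d); integrating from y(0) = 0, as the environment below does, shifts the numerical solution by a constant a*sin(d).

```
import math
import numpy as np
from scipy.integrate import odeint

a, w, d = 2.0, 2 * math.pi / 12, math.pi / 4
t = np.arange(0, 24, 0.2)

def dydt(y, t, a, w, d):
    # right-hand side of the ODE: y' = a*cos(w*t + d)*w
    return a * math.cos(w * t + d) * w

# start from y(0) = a*sin(d) so the integral matches the closed form
y_num = odeint(dydt, a * math.sin(d), t, args=(a, w, d)).flatten()
y_true = a * np.sin(w * t + d)
print(np.square(y_num - y_true).mean())  # ~0
```
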
In my program, a, w and d are initially generated at random within some range. After that, the agent (the algorithm) takes an action (an action simply keeps, increases or decreases the values of a, w and d, so the agent can pick one of 27 different actions).
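
To make the 27-action space concrete, here is a small sketch (not from the original post) that enumerates every combination of {keep, increase, decrease} over (a, w, d); the indexing convention is assumed to match the step1 method shown further down:

```
from itertools import product

# digit meaning: 0 = keep, 1 = increase by 10%, 2 = decrease by 10%
# action index = a_digit*9 + w_digit*3 + d_digit (base 3, a most significant)
ACTIONS = list(product((0, 1, 2), repeat=3))  # 27 (a_op, w_op, d_op) triples
print(len(ACTIONS))   # 27
print(ACTIONS[13])    # (1, 1, 1): increase a, w and d
```
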
After solving the ODE with odeint() in that environment, I compute the mean squared error as the loss and treat the negative loss as the reward. If the loss is zero, the episode is done. The state tuple is [a, w, d].
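
Since a floating-point MSE will practically never be exactly zero, a common choice is to end the episode once the loss falls below a small tolerance. A minimal sketch of the reward/termination logic described above (the helper name reward_and_done and the tolerance tol are my own, not taken from the code below):

```
import numpy as np

def reward_and_done(y_pred, y_true, tol=1e-3):
    # negative mean squared error as the reward; terminate once the fit
    # is "good enough" rather than waiting for an exact zero
    loss = float(np.square(y_pred - y_true).mean())
    return -loss, loss < tol
```
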
Please tell me whether I am heading in the right direction, and what I should do next. I also have a doubt: in ordinary machine learning, after training we end up with a single model that we then test; how can we do something similar here, if that is possible? (See the evaluation sketch after the agent code below.)
I have two files, one for the environment and the other for the agent.
**environment**

```
import numpy as np
import math
from scipy.integrate import odeint
import random

class solver():
    def get_experiment_data(self):
        # target data: y = 2*sin((2*pi/12)*t + pi/4) sampled on self.time
        for count, t in enumerate(self.time):
            self.y_true[count] = self.data_generator_sin(2, 2 * math.pi / 12, math.pi / 4, t)

    def data_generator_sin(self, a, w, d, t):
        return a * math.sin(w * t + d)

    def reset(self):
        self.a = random.uniform(1.5,2.5)
        self.w = random.uniform(0.25,0.75)
        self.d = random.uniform(0.5, 0.8)
        self.loss = 0.0
        return [self.a, self.w, self.d]

    def step1(self, action):
        self.reward = 0
        self.done = 0
        # The action index (0-26) encodes three base-3 digits for (a, w, d):
        # 0 -> fixed, 1 -> increase by 10%, 2 -> decrease by 10%
        # (e.g. action 13 == digits 1,1,1 -> increase a, w and d).
        delta = {0: 0.0, 1: 0.1, 2: -0.1}
        a_op, rest = divmod(action, 9)
        w_op, d_op = divmod(rest, 3)
        self.a += self.a * delta[a_op]
        self.w += self.w * delta[w_op]
        self.d += self.d * delta[d_op]
        self.run_frame()

        state = [self.a, self.w, self.d]
        return self.reward, state, self.done
    def dydx(self, y, t, a, w, d):
        # right-hand side of the ODE: y' = a*cos(w*t + d)*w
        return a * math.cos(w * t + d) * w
    def run_frame(self):
        # integrate the ODE with the current (a, w, d) and score the fit
        parameter = (self.a, self.w, self.d)
        odeint_solver_data = odeint(self.dydx, 0, self.time, args=parameter).flatten()
        self.loss = np.square(np.subtract(odeint_solver_data, self.y_true)).mean()
        self.reward = -self.loss
        # if self.loss < 0.5:
        #     self.done = 1
    def __init__(self):

        self.reward = 0
        self.done = False
        self.a = random.uniform(1.5,2.5)
        self.w = random.uniform(0.25,0.75)
        self.d = random.uniform(0.5, 0.8)
        self.loss = 0.0
        self.start = 0
        self.stop = 24
        self.step = 0.2
        # evenly spaced time values from start (0) to stop (24) with step 0.2
        self.time = np.arange(self.start, self.stop, self.step)
        self.y_true = np.zeros((len(self.time),), dtype=float)
        self.get_experiment_data()
```
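
For reference, a quick sanity check of the environment (not part of the original post) could look like this:

```
from environment import solver

env = solver()
state = env.reset()                  # random initial [a, w, d]
reward, state, done = env.step1(13)  # action 13: increase a, w and d by 10%
print(state, reward, done)
```
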
**agent**
```
from environment import solver
import random
import numpy as np
from keras import Sequential
from collections import deque
from keras.layers import Dense
import matplotlib.pyplot as plt
from keras.optimizers import Adam

env = solver()
np.random.seed(0)


class DQN:
    """ Implementation of deep q learning algorithm """
    def __init__(self, action_space, state_space):
        self.action_space = action_space
        self.state_space = state_space
        self.epsilon = 1                    # exploration rate (epsilon-greedy)
        self.gamma = .95                    # discount factor
        self.batch_size = 64
        self.epsilon_min = .01
        self.epsilon_decay = .995
        self.learning_rate = 0.01
        self.memory = deque(maxlen=100000)  # replay buffer
        self.model = self.build_model()

    def build_model(self):

        model = Sequential()
        model.add(Dense(64, input_shape=(self.state_space,), activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(self.action_space, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # epsilon-greedy: explore with probability epsilon, otherwise exploit
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states = np.array([i[0] for i in minibatch])
        actions = np.array([i[1] for i in minibatch])
        rewards = np.array([i[2] for i in minibatch])
        next_states = np.array([i[3] for i in minibatch])
        dones = np.array([i[4] for i in minibatch])

        states = np.squeeze(states)
        next_states = np.squeeze(next_states)

        # Q-learning target: r + gamma * max_a' Q(s', a') for non-terminal
        # transitions; for terminal ones (done) the target is just the reward.
        targets = rewards + self.gamma*(np.amax(self.model.predict_on_batch(next_states), axis=1))*(1-dones)
        targets_full = self.model.predict_on_batch(states)

        ind = np.array([i for i in range(self.batch_size)])
        targets_full[[ind], [actions]] = targets

        self.model.fit(states, targets_full, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


def train_dqn(episode):

    loss = []
    agent = DQN(27, 3)
    for e in range(episode):
        state = env.reset()
        state = np.reshape(state, (1, 3))
        # print(state)
        score = 0
        max_steps = 1000
        for i in range(max_steps):
            action = agent.act(state)
            # print(type(action))
            reward, next_state, done = env.step1(action)
            score += reward
            next_state = np.reshape(next_state, (1, 3))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            agent.replay()
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break
        print("episode : {}/{}, state : {}, reward : {},".format(e, episode, state, reward))
        loss.append(score)
    return loss


if __name__ == '__main__':

    ep = 100
    loss = train_dqn(ep)
    plt.plot([i for i in range(ep)], loss)
    plt.xlabel('episodes')
    plt.ylabel('reward')
    plt.show()
```
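
Regarding testing after training: one option (a sketch, not from the original post; the helper evaluate is my own) is to run a single greedy episode with exploration switched off and read off the final [a, w, d]. The Keras model itself can also be persisted with agent.model.save(...) and reloaded later.

```
import numpy as np

def evaluate(agent, env, max_steps=1000):
    # run one episode greedily (epsilon = 0, no training) and return the
    # final parameter estimate [a, w, d] together with its reward
    agent.epsilon = 0.0
    state = np.reshape(env.reset(), (1, 3))
    reward = 0.0
    for _ in range(max_steps):
        action = agent.act(state)
        reward, next_state, done = env.step1(action)
        state = np.reshape(next_state, (1, 3))
        if done:
            break
    return state.flatten(), reward

# note: train_dqn() above would have to return its `agent` for this to be
# callable after training, e.g. params, final_reward = evaluate(agent, env)
```
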

0 Answers:

No answers yet.