Why does the output layer with a sigmoid activation produce only the values 0 or 1 {0,1} instead of values in [0,1]?

Time: 2019-11-06 02:27:41

Tags: python tensorflow deep-learning reinforcement-learning activation-function

I am implementing Deep Deterministic Policy Gradient (DDPG) to solve my problem, following this tutorial (https://www.youtube.com/watch?v=GJJc1t0rtSU) and using this Python source code (https://github.com/philtabor/Youtube-Code-Repository/blob/master/ReinforcementLearning/PolicyGradient/DDPG/pendulum/tensorflow/ddpg_orig_tf.py).

There are 3 actions (alpha, beta, gamma) and the state has 2 dimensions. I want the 3 actions to take values in [0,1], so I changed the output layer (the third layer) of the Actor class in "ddpg_orig_tf.py" from the 'tanh' activation to the 'sigmoid' activation. However, when I run the algorithm on my problem, the 3 actions only ever take the values 0 or 1 {0,1}; they never move inside the interval [0,1] over time. I don't think the problem is the activation function itself: when I switched back to tanh, I likewise only got {-1, 1}.
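
As background, a sigmoid can only get arbitrarily close to 0 or 1 when its pre-activation is large in magnitude, so printed actions of exactly 0 and 1 usually indicate that the actor's output layer has saturated rather than that the sigmoid's range is wrong. A minimal NumPy sketch, independent of the DDPG code, illustrating this:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# pre-activations of moderate size give values strictly inside (0, 1) ...
print(sigmoid(np.array([-2.0, 0.0, 2.0])))   # approx. [0.12, 0.5, 0.88]

# ... but large pre-activations saturate and print as (almost exactly) 0 or 1
print(sigmoid(np.array([-30.0, 30.0])))      # approx. [9e-14, 1.0]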

  • Here is the code I changed in "ddpg_orig_tf.py" (I changed 'tanh' to 'sigmoid' in the output layer of the Actor network):
# build_network() is a method of the Actor class in 'ddpg_orig_tf.py'
def build_network(self):
        with tf.variable_scope(self.name):
            self.input = tf.placeholder(tf.float32,
                                        shape=[None, *self.input_dims],
                                        name='inputs')

            self.action_gradient = tf.placeholder(tf.float32,
                                          shape=[None, self.n_actions],
                                          name='gradients')

            f1 = 1. / np.sqrt(self.fc1_dims)
            dense1 = tf.layers.dense(self.input, units=self.fc1_dims,
                                     kernel_initializer=random_uniform(-f1, f1),
                                     bias_initializer=random_uniform(-f1, f1))
            batch1 = tf.layers.batch_normalization(dense1)
            layer1_activation = tf.nn.relu(batch1)
            f2 = 1. / np.sqrt(self.fc2_dims)
            dense2 = tf.layers.dense(layer1_activation, units=self.fc2_dims,
                                     kernel_initializer=random_uniform(-f2, f2),
                                     bias_initializer=random_uniform(-f2, f2))
            batch2 = tf.layers.batch_normalization(dense2)
            layer2_activation = tf.nn.relu(batch2)
            f3 = 0.003
            # output layer: activation changed from 'tanh' to 'sigmoid' so that
            # each action component lies in (0, 1)
            mu = tf.layers.dense(layer2_activation, units=self.n_actions,
                                 activation='sigmoid',
                                 kernel_initializer=random_uniform(-f3, f3),
                                 bias_initializer=random_uniform(-f3, f3))
            # scale by the environment's action bound (1 here, since the
            # action space is Box(low=0, high=1))
            self.mu = tf.multiply(mu, self.action_bound)
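
If the only goal is to keep actions in [0, 1], an alternative (just a sketch, not part of the original file) is to keep the original tanh output layer and rescale it affinely; `layer2_activation`, `self.n_actions`, `f3`, and `random_uniform` are assumed to be exactly as in build_network above:

# sketch: drop-in replacement for the mu lines in build_network
raw = tf.layers.dense(layer2_activation, units=self.n_actions,
                      activation=tf.nn.tanh,
                      kernel_initializer=random_uniform(-f3, f3),
                      bias_initializer=random_uniform(-f3, f3))
# tanh gives values in (-1, 1); shift and scale them into (0, 1)
self.mu = 0.5 * (raw + 1.0)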

  • Here is my environment:
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
from os import path

class P_NOMAEnv():
    def __init__(self, distance1, distance2, power, B=15000, N0=10**-20, path_loss=2, g=1):

        self.B = B #bandwidth
        self.N0 = N0
        self.path_loss = path_loss
        self.g = g
        self.alpha_low = 0.
        self.alpha_high = 1.
        self.beta_low = 0.
        self.beta_high = 1.
        self.gamma_low = 0.
        self.gamma_high = 1.
        self.distance1 = np.random.randint(30,500)
        self.distance2 = 2*distance1
        self.power = power
        self.max_iteration = 1000

        self.high = np.array([self.B, self.power])
        self.action_space = spaces.Box(low=0., high=1., shape=(3,), dtype=np.float32)
        self.observation_space = spaces.Box(low=np.array([0.1, 0.0001]), high=np.array([self.B, self.power]), dtype=np.float32)

        self.seed()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]


    def cal_SINR_near(self, alpha, beta, gamma, g, distance1, path_loss, power, B, N0):
        # SINR of the near user (at distance1)
        h_near = g*(distance1**-path_loss)  # channel gain with path loss
        channel_noise = B*N0  # noise power over 1 subchannel
        non_overlap = (np.square(np.absolute(h_near))*power*0.5*(1-beta))/channel_noise
        overlap = (np.square(np.absolute(h_near))*power*gamma*(alpha+beta)*0.5)/(channel_noise + (np.square(np.absolute(h_near))*power*(1-gamma)*(alpha+beta)*0.5))
        SINR_near = non_overlap + overlap
        return SINR_near



    def cal_SINR_far(self, alpha, beta, gamma, g, distance2, path_loss, power, B, N0):
        # SINR of the far user (at distance2)
        h_far = g*(distance2**-path_loss)  # channel gain with path loss
        channel_noise = B*N0  # noise power over 1 subchannel
        non_overlap = (np.square(np.absolute(h_far))*power*0.5*(1-alpha))/channel_noise
        overlap = (np.square(np.absolute(h_far))*power*(1-gamma)*(alpha+beta)*0.5)/(channel_noise
                  + (np.square(np.absolute(h_far))*power*gamma*(alpha+beta)*0.5))
        SINR_far = non_overlap + overlap
        return SINR_far

    def cal_sum_rate(self, SINR_near, SINR_far, B, alpha, beta):
        # sum rate of the near and far users; used as the reward
        R_near = (1+alpha)*0.5*B*np.log2(1+SINR_near)
        R_far = (1+beta)*0.5*B*np.log2(1+SINR_far)
        sum_rate = R_near + R_far # reward
        return sum_rate

    def normalize(self, x):
        normalized = (x+1.2)/2.4
        return normalized


    def step(self, action):
        self.steps_taken += 1
        B, P = self.state
        # clip each action component (alpha, beta, gamma) into [0, 1]
        new_alpha = np.clip(action, self.alpha_low, self.alpha_high)[0]
        new_beta = np.clip(action, self.beta_low, self.beta_high)[1]
        new_gamma = np.clip(action, self.gamma_low, self.gamma_high)[2]
        SINR_near = self.cal_SINR_near(new_alpha, new_beta, new_gamma, self.g, self.distance1, self.path_loss, self.power, self.B, self.N0)
        SINR_far = self.cal_SINR_far(new_alpha, new_beta, new_gamma, self.g, self.distance2, self.path_loss, self.power, self.B, self.N0)
        reward = self.cal_sum_rate(SINR_near, SINR_far, self.B, new_alpha, new_beta)
        done = self.steps_taken >= self.max_iteration

        B_new=(1-new_beta)*0.5*self.B + (new_alpha+new_beta)*0.5*self.B
        P_new=(1-new_beta)*0.5*self.power + (new_alpha+new_beta)*0.5*new_gamma*self.power
        self.state = np.array([B_new, P_new])
        return self._get_obs(action), reward, done, {}, new_alpha, new_beta, new_gamma

    def _get_obs(self, action):
        new_alpha = np.clip(action, self.alpha_low, self.alpha_high)[0]
        new_beta = np.clip(action, self.beta_low, self.beta_high)[1]
        new_gamma = np.clip(action, self.gamma_low, self.gamma_high)[2]

        B_new=(1-new_beta)*0.5*self.B + (new_alpha+new_beta)*0.5*self.B
        P_new=(1-new_beta)*0.5*self.power + (new_alpha+new_beta)*0.5*new_gamma*self.power

        return np.array([B_new, P_new])

    def reset(self):
        self.steps_taken = 0

        a = np.random.random_sample((3,))
        self.state = self._get_obs(a)
        return self._get_obs(a)
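
Below is a quick sanity check of the environment (a sketch only; it assumes it runs in the same module as the P_NOMAEnv class above, with the same illustrative constructor arguments as in the main file below). It steps the environment with random actions strictly inside (0, 1), so the reward can be inspected away from the boundaries:

if __name__ == '__main__':
    env = P_NOMAEnv(distance1=100, distance2=200, power=2)
    obs = env.reset()
    for _ in range(5):
        a = np.random.uniform(0.05, 0.95, size=3)  # alpha, beta, gamma inside (0, 1)
        obs, reward, done, info, alpha, beta, gamma = env.step(a)
        print(np.around([reward, alpha, beta, gamma], decimals=4))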

  • Here is my main file:
import os
import gym
import numpy as np
from ddpg import Agent
from Environment import P_NOMAEnv
from utils import plotLearning

if __name__ == '__main__':
    a = np.random.randint(30,250)
    env = P_NOMAEnv(distance1=100, distance2=200, power=2, B=15000, N0=10**-20, path_loss=2, g=1)
    agent = Agent(alpha=0.00005, beta=0.0005, input_dims=[2], tau=0.001,
                  env=env, batch_size=64, layer1_size=400, layer2_size=300,
                  n_actions=3)
    np.random.seed(0)
    score_history = []
    score_history2 = []
    for i in range(4000):
        obs = env.reset()
        done = False
        score = 0
        maxi = np.zeros(4,)
        while not done:
            act = agent.choose_action(obs)
            new_state, reward, done, info, alpha, beta, gamma = env.step(act)
            agent.remember(obs, act, reward, new_state, int(done))
            agent.learn()
            score += reward
            obs = new_state
            if reward>maxi[0]: maxi = [reward, alpha, beta, gamma]
            #env.render()
        score_history.append(maxi[0])
        score_history2.append(score/1000)
    print('episode ', i+1,
          ', reward ', np.around(score/1000, decimals=4),
          ', max reward: ', np.around(maxi[0], decimals=4),
          ', with alpha: ', np.around(alpha, decimals=4),
          ', beta: ', np.around(beta, decimals=4),
          ', gamma: ', np.around(gamma, decimals=4),
          'trailing 100 episodes avg ', np.mean(score_history2[-100:]))

I also printed out the noise and mu (the output of the Actor). When I run main, the output looks like this:

Noise: [-0.26362168 -0.01389367 -0.39754398]  mu: [[1. 0. 0.]]  mu_prime: [ 0.73637832 -0.01389367 -0.39754398]

Noise: [-0.29287953 -0.03729832 -0.39651476]  mu: [[1. 0. 0.]]  mu_prime: [ 0.70712047 -0.03729832 -0.39651476]

.........

As you can see, mu always takes the value 0 or 1 {0,1} rather than values inside the interval [0,1]. I ran more than 1000 episodes and this did not change over time.

The problem may be in my environment, but I don't know how to fix it. If you have any ideas, please help me. Thank you.

1 Answer:

Answer 0 (score: 0)

You can't use sigmoid for multi-label output. You need to use softmax.
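
For context on that suggestion: a softmax couples the outputs so they sum to 1, while a sigmoid squashes each output independently into (0, 1). A small NumPy comparison (illustrative only, unrelated to the DDPG code):

import numpy as np

logits = np.array([2.0, -1.0, 0.5])
sig = 1.0 / (1.0 + np.exp(-logits))            # independent values in (0, 1)
soft = np.exp(logits) / np.exp(logits).sum()   # values in (0, 1) that sum to 1
print(np.around(sig, 4))    # approx. [0.8808, 0.2689, 0.6225]
print(np.around(soft, 4))   # approx. [0.7856, 0.0391, 0.1753]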