I am implementing Deep Deterministic Policy Gradient (DDPG) to solve my problem by following this tutorial (https://www.youtube.com/watch?v=GJJc1t0rtSU) and using this Python source code (https://github.com/philtabor/Youtube-Code-Repository/blob/master/ReinforcementLearning/PolicyGradient/DDPG/pendulum/tensorflow/ddpg_orig_tf.py).
There are 3 actions (alpha, beta, gamma) and the state has 2 dimensions. I want the 3 action values to lie in [0, 1], so I changed the activation of the output layer (the third layer of the Actor class) in the file ddpg_orig_tf.py from tanh to sigmoid. However, when I try to solve my problem with the algorithm, the 3 actions only ever take the value 0 or 1 ({0, 1}); they never vary inside the interval [0, 1] over time. I don't think the problem is the activation function itself: when I switched back to tanh, I likewise only got {-1, 1}.
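To show what I mean by "stuck at 0 or 1": once the pre-activations feeding the output layer are large in magnitude, sigmoid (and likewise tanh) saturates at its bounds. A tiny NumPy check, independent of the DDPG code and only for illustration:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Small pre-activations stay strictly inside (0, 1) ...
print(sigmoid(np.array([-0.5, 0.0, 0.5])))     # ~[0.378 0.5   0.622]
# ... but large pre-activations are pinned at ~0 or ~1,
# which is exactly what I observe for all three actions.
print(sigmoid(np.array([-20.0, 20.0, 50.0])))  # ~[0. 1. 1.]

Here is the build_network method of the Actor class with my change (the output layer is the only line I modified):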
def build_network(self):
    # Actor network from ddpg_orig_tf.py (TF 1.x); random_uniform is the
    # initializer imported at the top of that file.
    with tf.variable_scope(self.name):
        # Placeholders for the state input and the critic's action gradients
        self.input = tf.placeholder(tf.float32,
                                    shape=[None, *self.input_dims],
                                    name='inputs')
        self.action_gradient = tf.placeholder(tf.float32,
                                              shape=[None, self.n_actions],
                                              name='gradients')

        # First fully connected layer with batch norm and ReLU
        f1 = 1. / np.sqrt(self.fc1_dims)
        dense1 = tf.layers.dense(self.input, units=self.fc1_dims,
                                 kernel_initializer=random_uniform(-f1, f1),
                                 bias_initializer=random_uniform(-f1, f1))
        batch1 = tf.layers.batch_normalization(dense1)
        layer1_activation = tf.nn.relu(batch1)

        # Second fully connected layer with batch norm and ReLU
        f2 = 1. / np.sqrt(self.fc2_dims)
        dense2 = tf.layers.dense(layer1_activation, units=self.fc2_dims,
                                 kernel_initializer=random_uniform(-f2, f2),
                                 bias_initializer=random_uniform(-f2, f2))
        batch2 = tf.layers.batch_normalization(dense2)
        layer2_activation = tf.nn.relu(batch2)

        # Output layer: I changed the activation from 'tanh' to 'sigmoid'
        f3 = 0.003
        mu = tf.layers.dense(layer2_activation, units=self.n_actions,
                             activation='sigmoid',
                             kernel_initializer=random_uniform(-f3, f3),
                             bias_initializer=random_uniform(-f3, f3))
        self.mu = tf.multiply(mu, self.action_bound)
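For reference, another way I could get outputs in [0, 1] would be to keep the original tanh activation and rescale it affinely. This is only a sketch of what I mean, and since tanh also saturates at {-1, 1} for me, I expect it would show the same behaviour:

# Sketch only: keep tanh and map its (-1, 1) output range onto (0, 1).
raw = tf.layers.dense(layer2_activation, units=self.n_actions,
                      activation='tanh',
                      kernel_initializer=random_uniform(-f3, f3),
                      bias_initializer=random_uniform(-f3, f3))
self.mu = tf.multiply((raw + 1.0) * 0.5, self.action_bound)

Below is the custom environment I wrote for my problem: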
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
from os import path
class P_NOMAEnv():

    def __init__(self, distance1, distance2, power, B=15000, N0=10**-20, path_loss=2, g=1):
        self.B = B                      # bandwidth of one subchannel
        self.N0 = N0                    # noise power spectral density
        self.path_loss = path_loss      # path-loss exponent
        self.g = g                      # channel gain factor
        # Bounds of the three actions alpha, beta, gamma
        self.alpha_low = 0.
        self.alpha_high = 1.
        self.beta_low = 0.
        self.beta_high = 1.
        self.gamma_low = 0.
        self.gamma_high = 1.
        self.distance1 = np.random.randint(30, 500)   # near-user distance (note: overrides the distance1 argument with a random value)
        self.distance2 = 2 * distance1                 # far-user distance (twice the distance1 argument)
        self.power = power
        self.max_iteration = 1000
        self.high = np.array([self.B, self.power])
        self.action_space = spaces.Box(low=0., high=1., shape=(3,), dtype=np.float32)
        self.observation_space = spaces.Box(low=np.array([0.1, 0.0001]),
                                            high=np.array([self.B, self.power]),
                                            dtype=np.float32)
        self.seed()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def cal_SINR_near(self, alpha, beta, gamma, g, distance1, path_loss, power, B, N0):
        # SINR of the near user: non-overlapping part plus overlapping part
        h_near = g * (distance1 ** -path_loss)
        channel_noise = B * N0   # 1 subchannel
        non_overlap = (np.square(np.absolute(h_near)) * power * 0.5 * (1 - beta)) / channel_noise
        overlap = (np.square(np.absolute(h_near)) * power * gamma * (alpha + beta) * 0.5) / (
            channel_noise + (np.square(np.absolute(h_near)) * power * (1 - gamma) * (alpha + beta) * 0.5))
        SINR_near = non_overlap + overlap
        return SINR_near

    def cal_SINR_far(self, alpha, beta, gamma, g, distance2, path_loss, power, B, N0):
        # SINR of the far user
        h_far = g * (distance2 ** -path_loss)
        channel_noise = B * N0   # 1 subchannel
        non_overlap = (np.square(np.absolute(h_far)) * power * 0.5 * (1 - alpha)) / channel_noise
        overlap = (np.square(np.absolute(h_far)) * power * (1 - gamma) * (alpha + beta) * 0.5) / (
            channel_noise + (np.square(np.absolute(h_far)) * power * gamma * (alpha + beta) * 0.5))
        SINR_far = non_overlap + overlap
        return SINR_far

    def cal_sum_rate(self, SINR_near, SINR_far, B, alpha, beta):
        # Sum rate of both users; this is the reward
        R_near = (1 + alpha) * 0.5 * B * np.log2(1 + SINR_near)
        R_far = (1 + beta) * 0.5 * B * np.log2(1 + SINR_far)
        sum_rate = R_near + R_far
        return sum_rate

    def normalize(self, x):
        normalized = (x + 1.2) / 2.4
        return normalized

    def step(self, action):
        self.steps_taken += 1
        B, P = self.state
        # Clip each component of the action to its valid range
        new_alpha = np.clip(action, self.alpha_low, self.alpha_high)[0]
        new_beta = np.clip(action, self.beta_low, self.beta_high)[1]
        new_gamma = np.clip(action, self.gamma_low, self.gamma_high)[2]
        SINR_near = self.cal_SINR_near(new_alpha, new_beta, new_gamma, self.g, self.distance1, self.path_loss, self.power, self.B, self.N0)
        SINR_far = self.cal_SINR_far(new_alpha, new_beta, new_gamma, self.g, self.distance2, self.path_loss, self.power, self.B, self.N0)
        reward = self.cal_sum_rate(SINR_near, SINR_far, self.B, new_alpha, new_beta)
        done = self.steps_taken >= self.max_iteration
        # New state: bandwidth and power after the allocation
        B_new = (1 - new_beta) * 0.5 * self.B + (new_alpha + new_beta) * 0.5 * self.B
        P_new = (1 - new_beta) * 0.5 * self.power + (new_alpha + new_beta) * 0.5 * new_gamma * self.power
        self.state = np.array([B_new, P_new])
        return self._get_obs(action), reward, done, {}, new_alpha, new_beta, new_gamma

    def _get_obs(self, action):
        new_alpha = np.clip(action, self.alpha_low, self.alpha_high)[0]
        new_beta = np.clip(action, self.beta_low, self.beta_high)[1]
        new_gamma = np.clip(action, self.gamma_low, self.gamma_high)[2]
        B_new = (1 - new_beta) * 0.5 * self.B + (new_alpha + new_beta) * 0.5 * self.B
        P_new = (1 - new_beta) * 0.5 * self.power + (new_alpha + new_beta) * 0.5 * new_gamma * self.power
        return np.array([B_new, P_new])

    def reset(self):
        self.steps_taken = 0
        a = np.random.random_sample((3,))   # random initial action
        self.state = self._get_obs(a)
        return self._get_obs(a)
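For completeness, this is how I sanity-check the environment on its own, independent of the agent (a quick standalone run; the action [0.5, 0.5, 0.5] is just an arbitrary mid-range choice):

# Quick standalone check of P_NOMAEnv (not part of the training script).
env = P_NOMAEnv(distance1=100, distance2=200, power=2,
                B=15000, N0=10**-20, path_loss=2, g=1)
obs = env.reset()
action = np.array([0.5, 0.5, 0.5])   # alpha, beta, gamma
new_obs, reward, done, info, alpha, beta, gamma = env.step(action)
print('obs:', obs, 'new_obs:', new_obs, 'reward:', reward, 'done:', done)

And here is my main training script: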
import os
import gym
import numpy as np
from ddpg import Agent
from Environment import P_NOMAEnv
from utils import plotLearning
if __name__ == '__main__':
    a = np.random.randint(30, 250)
    env = P_NOMAEnv(distance1=100, distance2=200, power=2, B=15000, N0=10**-20, path_loss=2, g=1)
    agent = Agent(alpha=0.00005, beta=0.0005, input_dims=[2], tau=0.001,
                  env=env, batch_size=64, layer1_size=400, layer2_size=300,
                  n_actions=3)
    np.random.seed(0)
    score_history = []
    score_history2 = []
    for i in range(4000):
        obs = env.reset()
        done = False
        score = 0
        maxi = np.zeros(4,)
        while not done:
            act = agent.choose_action(obs)
            new_state, reward, done, info, alpha, beta, gamma = env.step(act)
            agent.remember(obs, act, reward, new_state, int(done))
            agent.learn()
            score += reward
            obs = new_state
            if reward > maxi[0]:
                maxi = [reward, alpha, beta, gamma]
            #env.render()
        score_history.append(maxi[0])
        score_history2.append(score/1000)
        print('episode ', i+1, ', reward ', np.around(score/1000, decimals=4),
              ', max reward: ', np.around(maxi[0], decimals=4),
              ', with alpha: ', np.around(alpha, decimals=4),
              ', beta: ', np.around(beta, decimals=4),
              ', gamma: ', np.around(gamma, decimals=4),
              'trailing 100 episodes avg ', np.mean(score_history2[-100:]))
When I run main, the output looks like this:
noise: [-0.26362168 -0.01389367 -0.39754398]  mu: [[1. 0. 0.]]  mu_prime: [ 0.73637832 -0.01389367 -0.39754398]
noise: [-0.29287953 -0.03729832 -0.39651476]  mu: [[1. 0. 0.]]  mu_prime: [ 0.70712047 -0.03729832 -0.39651476]
.........
As you can see, mu only ever takes the values 0 or 1 ({0, 1}) instead of varying within the interval [0, 1]. I have tried running more than 1000 episodes, but it does not change over time.
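I also noticed that mu_prime = mu + noise can fall outside [0, 1] (the second and third components above are negative). As far as I can tell the exploration noise is added without clipping; a hypothetical clipping step, not in the repo's code and only to illustrate what I mean, would be:

import numpy as np

# Hypothetical helper (not in the repo): clip the noisy action to the
# Box(low=0., high=1., shape=(3,)) action space before stepping the env.
def clip_to_action_space(mu_prime, low=0.0, high=1.0):
    return np.clip(mu_prime, low, high)

mu = np.array([1.0, 0.0, 0.0])
noise = np.array([-0.26362168, -0.01389367, -0.39754398])
print(clip_to_action_space(mu + noise))   # [0.73637832 0.         0.        ]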
The problem may be in my environment, but I do not know how to fix it. If you have any ideas, please help me solve this. Thank you.
Answer 0 (score: 0)
You cannot use sigmoid for multi-label output. You need to use softmax.
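A rough sketch of that change in the actor's output layer (untested; note that softmax makes the three action values sum to 1):

# Sketch only (untested): softmax output layer in place of sigmoid.
# With softmax, alpha + beta + gamma will always sum to 1.
mu = tf.layers.dense(layer2_activation, units=self.n_actions,
                     activation=tf.nn.softmax,
                     kernel_initializer=random_uniform(-f3, f3),
                     bias_initializer=random_uniform(-f3, f3))
self.mu = tf.multiply(mu, self.action_bound)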