What can I do to improve the results of a linear output?

Asked: 2018-05-16 11:57:22

Tags: python neural-network

With a sigmoid activation function on my output, the network learns the task better than with a linear activation function.

I am using L2 regularization with my cost function, and I have a learning rate and a momentum term, but the network still learns better with the sigmoid activation on the output.

What can I do to improve the results?

import csv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
import numpy as np
import numpy.random as r
import matplotlib.pyplot as plt
import random

random.seed(25)

def readcsv(filename):
    ifile = open(filename, "r", newline="")
    reader = csv.reader(ifile, delimiter=",")

    rownum = 0
    dataset = []

    for row in reader:
        dataset.append(row)
        rownum += 1
    data = []
    for s in dataset:
        Dataset = [float(i) for i in s]
        data.append(Dataset)
    return [data, rownum]

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_deriv(x):
    return x * (1 - x)

def initialise_weights(nn_structure):
    weights = {}
    bias = {}
    c_weights = {1: np.ones((nn_structure[1], nn_structure[1]))}
    context = {0: r.random_sample((nn_structure[1]))}
    for l in range(1, len(nn_structure)):
        q = []
        for j in range(1, nn_structure[l] + 1):
            w = [random.uniform(-0.09, 0.09) for i in range(nn_structure[l-1])]
            q.append(w)
        weights[l] = np.array(q)
        bias[l] = r.random_sample((nn_structure[l],))
    print(weights)
    return weights, bias, c_weights, context


def initialise_weights_changes(nn_structure):
    deltaweights = {}
    deltabias = {}
    deltac_weights = {1: np.zeros((nn_structure[1], nn_structure[1]))}
    for l in range(1, len(nn_structure)):
        deltaweights[l] = np.zeros((nn_structure[l], nn_structure[l-1]))
        deltabias[l] = np.zeros((nn_structure[l],))
    return deltaweights, deltabias, deltac_weights


def feed_forward(x, weights, bias, c_weights, context, i, hidden_layer):
    hidden_layer[i] = {1: x}
    activations = {}
    for l in range(1, len(weights) + 1):
        node_in = hidden_layer[i][l]
        if l == 1:
            activations[l+1] = sigmoid(weights[l].dot(node_in) + c_weights[l].dot(context[i]) + bias[l])
        else:
            activations[l+1] = weights[l].dot(node_in) + bias[l]
        hidden_layer[i][l+1] = activations[l+1]
        if l == 1:
            context[i + 1] = hidden_layer[i][l+1]
    return hidden_layer, activations


def calculate_out_layer_delta(y, hidden_layer):#, activations):
    return -(y - hidden_layer) 


def calculate_hidden_delta(delta_plus_1, weights_l):#, z_l):
    return np.dot(np.transpose(weights_l), delta_plus_1) 

def train_nn(nn_structure, X, y, iter_num=1000, alpha=0.6, momentum = 0.4, bptt = 5, reg = 0.0000009):
    weights, bias, c_weights, context = initialise_weights(nn_structure)
    cnt = 0
    m = len(y)
    avg_cost_func = []
    print('Starting gradient descent for {} iterations'.format(iter_num))

    while cnt < iter_num:
        if cnt % 1000 == 0:
            print('Iteration {} of {}'.format(cnt, iter_num))
        deltaweights, deltabias, deltac_weights = initialise_weights_changes(nn_structure)
        avg_cost = 0
        hidden_layer = {}
        delta = {}
        bp = []
        for i in range(len(y)):
            # perform the feed forward pass and return the stored h and z values, to be used in the gradient descent step
            hidden_layer, activations = feed_forward(X[i], weights, bias, c_weights, context, i, hidden_layer)
            bp.append(i)

            # loop from nl-1 to 1 backpropagating the errors
            if len(bp) == bptt:
                for j in reversed(bp):
                    delta[j] = {}
                    if j == bp[-1]:
                        for l in range(len(nn_structure), 0, -1):
                            if l == len(nn_structure):
                                delta[j][l] = calculate_out_layer_delta(y[j], hidden_layer[j][l]) 
                                avg_cost += mean_squared_error(y[j] , hidden_layer[j][l]) + 0.5 * reg * np.linalg.norm(np.power(weights[l - 1], 2)) + 0.5 * reg * np.linalg.norm(np.power(weights[l - 2], 2))
                            else:
                                if l > 1:
                                    delta[j][l] = calculate_hidden_delta(delta[j][l+1], weights[l]) * sigmoid_deriv(hidden_layer[j][l])

                    else:
                        for l in range(len(nn_structure), 0, -1):
                            if l == len(nn_structure):
                                delta[j][l] = calculate_out_layer_delta(y[j], hidden_layer[j][l])
                                avg_cost += np.linalg.norm((y[j] - hidden_layer[j][l]))
                            else:
                                if l > 1:
                                    delta[j][l] = calculate_hidden_delta(delta[j][l + 1], weights[l]) + calculate_hidden_delta(delta[j + 1][l], c_weights[1]) * sigmoid_deriv(hidden_layer[j][l])


                    for l in range(len(nn_structure) - 1, 0, -1):
                        deltaweights[l] = (-alpha * np.dot(delta[j][l + 1][:, np.newaxis], np.transpose(hidden_layer[j][l][:, np.newaxis]))) + (momentum * deltaweights[l]) + (reg / bptt * weights[l])
                        deltabias[l] = (-alpha * delta[j][l + 1]) + ((momentum * deltabias[l])) + (reg / bptt * bias[l])
                        if l == 1:
                            deltac_weights[l] += (-alpha * np.dot(delta[j][1 + 1][:, np.newaxis], np.transpose(context[j][:, np.newaxis]))) + (momentum * deltac_weights[l]) + (reg / bptt * c_weights[l])

                    # perform the gradient descent step for the weights in each layer
                    for l in range(len(nn_structure) - 1, 0, -1):
                        weights[l] += (1 / bptt * deltaweights[l]) - (reg / bptt * weights[l])
                        bias[l] += (1 / bptt * deltabias[l]) - (reg / bptt * bias[l])
                        if l == 1:
                            c_weights[l] += (1 / bptt * deltac_weights[l]) - (reg / bptt * c_weights[l])


                bp = []
                deltaweights, deltabias, deltac_weights = initialise_weights_changes(nn_structure)

        # complete the average cost calculation
        if cnt % 500 == 0:
            print(weights)
        avg_cost = 1.0 / m * avg_cost
        if cnt % 1000 == 0:
            print('Error', avg_cost)
        avg_cost_func.append(avg_cost)
        cnt += 1

        alpha = alpha - (alpha/iter_num)

    return weights, bias, avg_cost_func, c_weights, context

def predict_y(weights, bias, X, c_weights, context):
    m = X.shape[0]
    y = np.zeros((m,))
    for i in range(m):
        hidden_layer = {1: X[i]}
        for l in range(1, len(weights) + 1):
            node_in = hidden_layer[l]
            if l == 1:
                activations = weights[l].dot(node_in) + c_weights[l].dot(context[l]) + bias[l]
            else:
                activations = weights[l].dot(node_in) + bias[l]
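            # note: sigmoid is applied to every layer below, including the final
            # output layer, whereas feed_forward above leaves the output linear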
            hidden_layer[l + 1] = sigmoid(activations)
        y[i] = hidden_layer[3]
    return y

if __name__ == "__main__":
    # load data and scale
    filename = 'C:/Users/n0762538/Documents/Data/MackeyGlass/MackeyGlass.csv'


    dataset, rownum = readcsv(filename)
    #np.random.shuffle(dataset)

    dataset = np.array(dataset)

    # define data
    no = int(0.70 * len(dataset))
    train_data = dataset[0:no]
    test_data = dataset[no:-1]
    train_output = dataset[1:no + 1]
    test_output = dataset[no + 1:]

    X = train_data
    y = train_output

    # setup the NN structure
    nn_structure = [len(dataset[0]), 3, len(dataset[0])]
    # train the NN
    weights, bias, avg_cost_func, c_weights, context = train_nn(nn_structure, X, y)
    # plot the avg_cost_func
    plt.plot(avg_cost_func)
    plt.ylabel('Average J')
    plt.xlabel('Iteration number')
    plt.show()
    # get the prediction accuracy and print

    #print(weights)
    #print('test:', test_output)

    y_pred = predict_y(weights, bias, test_data, c_weights, context)
    print('Prediction accuracy is {}%'.format(r2_score(test_output, y_pred) * 100))
    #plt.plot(train_data)
    plt.plot(y_pred)
    plt.plot(test_output)

    plt.title('Approach 1')
    plt.ylabel('Predicted')
    plt.xlabel('Iteration number')
    plt.show()

1 Answer:

Answer 0 (score: 0)

If by "results" you mean improving the error rate or some evaluation metric, here are a few guidelines.

To be clear, Error = 100 - Evaluation Metric. If your accuracy is 99%, your error is 1% (good). If your accuracy is 85%, your error is 15% (not so good).

If your training error is very high (test error = 16%, train error = 15%): you have a BIAS problem. You need a BIGGER/DEEPER neural network with more layers, train for more gradient-descent iterations, or simply try a different network architecture (CNN, RNN).
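As a minimal, untuned sketch of that advice against the question's own train_nn call (the hidden-layer width of 10 and iter_num=5000 are illustrative assumptions, not recommendations):

# Hypothetical tweak to the posted code: a wider hidden layer and more
# gradient-descent iterations, aimed at reducing bias (underfitting).
# The posted code assumes exactly one recurrent hidden layer, so widening
# it is the simplest "bigger network" change that still runs as-is.
nn_structure = [len(dataset[0]), 10, len(dataset[0])]   # 10 hidden units instead of 3
weights, bias, avg_cost_func, c_weights, context = train_nn(
    nn_structure, X, y, iter_num=5000, alpha=0.6, momentum=0.4)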

If your test error is much higher than your train error (test error = 11%, train error = 1%): you have a VARIANCE problem. You need more training data, apply regularization, or try a different network architecture (CNN, RNN).
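A comparable sketch for the variance case, again using the question's train_nn: train on more of the series and strengthen the L2 penalty the code already has via its reg argument (the 0.85 split and reg=1e-4 are illustrative values):

# Hypothetical tweak to the posted code: more training data and a stronger
# L2 penalty, aimed at reducing variance (overfitting).
no = int(0.85 * len(dataset))                      # train on more of the series
train_data, train_output = dataset[0:no], dataset[1:no + 1]
test_data, test_output = dataset[no:-1], dataset[no + 1:]
weights, bias, avg_cost_func, c_weights, context = train_nn(
    nn_structure, train_data, train_output, reg=1e-4)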

If both your train error and your test error are high (train error: 15%, test error: 16%), then you have HIGH BIAS and HIGH VARIANCE, and you need to do both of the things described above.

To see whether you have a BIAS problem, look at your training-set performance. To see whether you have a VARIANCE problem, look at your dev/test-set performance. More importantly, you need to know the Bayes error, or human-level performance. If the Bayes error is 15%, meaning NO machine learning algorithm can do better than 15%, and your training error is 16%, there is almost no room for improvement by fixing BIAS.
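To see which case applies here, a quick diagnostic sketch using the question's predict_y (mean squared error is used instead of accuracy because the Mackey-Glass targets are continuous; the Bayes/human-level error still has to be estimated separately):

from sklearn.metrics import mean_squared_error

# compare training-set and test-set error to locate the problem
train_pred = predict_y(weights, bias, train_data, c_weights, context)
test_pred = predict_y(weights, bias, test_data, c_weights, context)
print('train MSE:', mean_squared_error(train_output, train_pred))  # high on its own -> bias problem
print('test  MSE:', mean_squared_error(test_output, test_pred))    # much higher than train -> variance problem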