Question

我正在尝试自己实现一个简单的神经网络来对点进行分类。我听说过我感兴趣的特定类型的激活函数，高斯。我不只是想使用relus或sigmoids，我正在尝试构建一个网络，其中输入大约300 x和y值，然后在第一层计算这些值上的高斯函数，大约有50个神经元，每个神经元都有一个单独的x和y值作为它们的平均值（我将保持sigma不变）。在数学上我预计这看起来像

exp(- [(x-Mx)^2 + (y-My)^2] / (2 * sigma^2) ) / (sqrt(2*pi*sigma))

然后我将对第一层中的所有神经元执行这些术语的加权和，添加偏差，并将其传递给sigmoid以获得我的预测。我将为每个训练示例执行此步骤并获取预测列表。我认为我会进行前向传播，但我会包含代码，以防有人在我的实现中发现明显的错误。然后我执行反向传播。我已经测试了我对权重和偏见的更新，我相信它们不是问题所在。我认为我对平均值的实现存在一些问题，因为它们总是聚集到单个点，这显然不会使成本函数最大化。我已经尝试过使用几个不同的数据集，并改变一些超级参数，但都无济于事。谁能弄明白问题是什么？这是我的代码。

# libraries
import matplotlib.patches as patches
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pdb


# functions

def gaussian(sq_error, sigma):
    return ((1/np.sqrt(2*np.pi*sigma**2))) * np.exp(-(sq_error)/(2*sigma**2))

def calc_X1(X0, Mx, My, m, sigma):
    X1 = [] # shape will be (10, m)
    for ex in range(0, m):
        sq_error = (X0[0][ex] - Mx) **2 + (X0[1][ex] - My) **2
        X1.append(gaussian(sq_error, sigma))
    X1 = np.array(X1)
    return X1.T

def sigmoid(Z):
    return 1 / (1 + np.exp(-Z))

def calc_X2(W2, X1, b2):
    return sigmoid(np.dot(W2, X1) + b2)

def cost(X2, Y, m):
    return -1/m * ( np.dot(Y, np.log(X2.T)) + np.dot(1-Y, np.log(1-X2.T))) [0] 

def calc_dZ2(X2, Y):
    return X2 - Y

def calc_dM(dZ2, W2, X1, sigma, M, m, xOrY, X0):
    cur_dM = np.zeros(M.shape)
    for i in range(0, m):
        # pdb.set_trace()
        cur_dM += dZ2[0][i] * float(np.dot(W2, X1.T[i])) * 1/sigma**2 * (X0[xOrY][i] - M)
    return cur_dM / m


def train_correct(X2, Y, m):
    ct = 0
    for i in range(0, m):
        if np.round(X2[0][i]) == Y[i]:
            ct += 1
    return ct / m


# graphing functions
def plot_train_data(X, Y, m, ax):
    for ex in range(0, m):
        xCur = X[0][ex]
        yCur = X[1][ex]
        if Y[ex] == 1:
            color=(1, 0, 0)
        else:
            color=(0,0,1)
        ax.scatter(xCur, yCur, c=color)

def probability_hash(pr):
    return (float(pr), float(np.round(pr)), float(1-pr))

def probability_hash_1d(pr):
    return float(pr)

def plot_boundary(Mx, My, sigma, W2, b2, ax):
    boundsx = [-5, 5]
    boundsy = [-5, 5]

    samples = [10, 10]

    width = (boundsx[1] - boundsx[0]) / samples[0]
    height = (boundsy[1] - boundsy[0]) / samples[1] 

    pt = np.zeros((2,1))
    for x in np.linspace(boundsx[0], boundsx[1], samples[0]):
        for y in np.linspace(boundsy[0], boundsy[1], samples[1]):
            pt[0][0] = x
            pt[1][0] = y
            X1_cur = calc_X1(pt, Mx, My, 1, sigma)
            X2_cur = calc_X2(W2, X1_cur, b2)
            # ax.add_patch(patches.Rectangle((x, y), width, height, facecolor=probability_hash(X2_cur)))
            ax.scatter(x, y, c=probability_hash(X2_cur))


def cool_plot_boundary(Mx, My, sigma, W2, b2, ax):
    boundsx = [-2, 2]
    boundsy = [-2, 2]

    samples = [50, 50]

    width = (boundsx[1] - boundsx[0]) / samples[0]
    height = (boundsy[1] - boundsy[0]) / samples[1] 

    pt = np.zeros((2,1))

    heats = []

    xs = np.linspace(boundsx[0], boundsx[1], samples[0])
    ys = np.linspace(boundsy[0], boundsy[1], samples[1])

    for x in xs:
        heats.append([])
        for y in ys:
            pt[0][0] = x
            pt[1][0] = y
            X1_cur = calc_X1(pt, Mx, My, 1, sigma)
            X2_cur = calc_X2(W2, X1_cur, b2)
            heats[-1].append(probability_hash_1d(X2_cur))

    # xticks = []
    # yticks = []
    # for i in range(0, len(xs)):
    #   if i % 3 == 0:
    #       xticks.append(round(xs[i], 2))
    # for i in range(0, len(ys)):
    #   if i % 3 == 0:
    #       yticks.append(round(ys[i], 2))

    xticks = []
    yticks = []

    sns.heatmap(heats, ax=ax, cbar=True, xticklabels=xticks, yticklabels=yticks)

def plot_m(Mx, My, n1, ax):
    for i in range(0, n1):
        ax.scatter(Mx[i], My[i], c="k")


# initialize parameters
file = "data/disk2.csv"
df = pd.read_csv(file)

sigma = 2
itterations = 10000
learning_rate = 0.9

n0 = 2  # DO NOT CHANGE, formality
X0 = np.row_stack((df["0"], df["1"]))  # shape is (2, m)
Y = np.array(df["2"])

m = len(Y)

n1 = 50
Mx = np.random.randn(n1)
My = np.random.randn(n1)
X1 = calc_X1(X0, Mx, My, m, sigma)

n2 = 1  # DO NOT CHANGE, formality
small_number = 0.01
W2 = np.random.randn(1, n1) * small_number
b2 = 0
X2 = calc_X2(W2, X1, b2)

J = cost(X2, Y, m)
Js = []
itters = []

fig = plt.figure()
plotGap = 200


for i in range(0, itterations):
    # forward propogation
    X1 = calc_X1(X0, Mx, My, m, sigma)
    X2 = calc_X2(W2, X1, b2)

    J = cost(X2, Y, m)

    if i % plotGap == 0:
        fig.clear()
        costAx = fig.add_subplot(311)
        plotAx = fig.add_subplot(312)
        pointsAx = fig.add_subplot(313)
        cool_plot_boundary(Mx, My, sigma, W2, b2, plotAx)
        # plot_boundary(Mx, My, sigma, W2, b2, plotAx)
        plot_train_data(X0, Y, m, pointsAx)
        Js.append(J)
        itters.append(i)
        costAx.plot(itters, Js, c="k")
        print("cost = " + str(J) + "\ttraining correct = " + str(train_correct(X2, Y, m)))
        plot_m(Mx, My, n1, pointsAx)
        plt.pause(0.1)

    # back propogation

    dZ2 = calc_dZ2(X2, Y)
    dW2 = np.dot(dZ2, X1.T) / m
    db2 = np.sum(dZ2) / m
    dMx = calc_dM(dZ2, W2, X1, sigma, Mx, m, 0, X0)
    dMy = calc_dM(dZ2, W2, X1, sigma, My, m, 1, X0)

    b2 -= learning_rate * db2
    W2 -= learning_rate * dW2
    Mx -= learning_rate * dMx
    My -= learning_rate * dMy

对于数据，我有一个带有一堆点位置和标签的csv。您可以使用此代码生成类似的csv。（确保在运行此文件夹的文件夹中有一个名为data的文件夹。）

# makes data in R2 to learn

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

n = 2
# number of exaples
m = 300

X = []
Y = []


# hyperparamers for data
rApprox = 1
error = 0.4
noise = 0.1

name = "data/disk2"

plt.cla()
for ex in range(0, m):
    xCur = np.random.randn(2)
    X.append(xCur)
    if abs(np.linalg.norm(xCur) + np.random.randn()*noise - rApprox) < error:
        Y.append(1)
        color="r"
    else:
        Y.append(0)
        color="b"
    plt.scatter(xCur[0], xCur[1], c=color)
    if abs(np.random.randn()) < 0.01:
        plt.pause(0.1)
plt.pause(1)
plt.savefig(name + ".png")

X = np.array(X)
Y = np.array(Y)

df = pd.DataFrame(X)
df[2] = Y
df.to_csv(name + ".csv", index=False)

感谢您的帮助。

Answer 1

将此函数替换为calculate dm函数。在乘法时必须小心，维度不仅仅是足够的。

def calculuate_dMs(X0, X1, X2, Mx, My, W2, dZ2, sigma, m, n1):
    # pdb.set_trace()
    X0x_big = np.dot(np.ones((n1, 1)), X0[0].reshape(1, m))
    X0y_big = np.dot(np.ones((n1, 1)), X0[1].reshape(1, m))
    Mx_big =  np.dot(Mx.reshape(n1, 1), np.ones((1, m)))
    My_big =  np.dot(My.reshape(n1, 1), np.ones((1, m)))
    W2_big =  np.dot(W2.reshape(n1, 1), np.ones((1, m)))
    dZ2_big =  np.dot(np.ones((n1, 1)), dZ2.reshape(1, m))

    dxTemp = np.multiply(np.multiply(np.multiply((X0x_big - Mx_big), X1), W2_big), dZ2_big)
    dyTemp = np.multiply(np.multiply(np.multiply((X0y_big - My_big), X1), W2_big), dZ2_big)

    return (np.sum(dxTemp, axis=1)/m, np.sum(dyTemp, axis=1)/m)

径向基础网络衍生物正在共同推动

1 个答案: