Convergence problem with a 2D RBF neuron implemented as a Keras layer

Date: 2017-06-01 16:21:58

Tags: keras keras-layer

We implemented a 2D Gaussian radial basis function (RBF) neuron as a Keras layer and are running into convergence problems for batch sizes larger than 1. The neuron should implement the following function:

f(x, y) = exp(-a * ((x - x_0)² + (y - y_0)²))

Here x_0, y_0, and a are fit parameters.

Test case

At the moment we are running correctness tests and trying to fit just a single neuron to the 2D function above. The neuron should be able to (and, with batch_size 1, actually does) approximate this function exactly. The optimal loss is 0.
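
As a sanity check that a loss of 0 is actually attainable, here is a small NumPy-only sketch (no Keras). The values mean_x = mean_y = 0 and opening = 0.1 are our own derivation from the normalization used in the minimal example below (64.8 * opening = b * anglerange² with b = 5e-5 and anglerange = 360), so take this as a sketch rather than part of the layer code:

import numpy as np

anglestep, minangle, maxangle = 10.0, -180.0, 180.0
anglerange = maxangle - minangle
delta_angle_half = anglestep / 2.0
phi = np.arange(minangle, maxangle, anglestep)
psi = np.arange(minangle, maxangle, anglestep)
ph, ps = np.meshgrid(phi + delta_angle_half, psi + delta_angle_half, indexing="ij")

# Training targets, same as twodenergy() in the minimal example below
target = 50.0 * np.exp(-5e-5 * ((ph + 180.0) ** 2 + (ps + 180.0) ** 2))
# Normalized inputs, same as x_train in the minimal example below
x = (ph - minangle) / anglerange
y = (ps - minangle) / anglerange

# Single neuron output with mean_x = mean_y = 0.0 and opening = 0.1
neuron = 50.0 * np.exp(-64.8 * 0.1 * (x ** 2 + y ** 2))
print(np.allclose(target, neuron))  # prints True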

Problem

If we choose a batch size of 1 in this code, the Keras fit converges very often and almost independently of the starting parameters. If we increase the batch size, the fit can turn into a random walk, freeze, or not converge at all. In all of these cases (even batch_size 2) convergence is much worse than in the batch_size 1 case. If we set batch_size to the size of the training set (i.e. 1296, the batch size we actually want), the fit freezes most of the time, largely independently of the learning rate. (A small sketch that reruns the fit for several batch sizes and compares the final losses is appended after the minimal example at the end of this post.)

Code

We implemented this layer in the following code:

# 2D RBF Layer
# In case anybody wants to use this code afterwards:
# Licenses: Apache, MIT, BSD, LGPLv2 and v3 and Public Domain
# Input: x,y Pairs, shape: (2,)
# Output: exp(-a* ((x-x_0)**2 + (y-y_0)**2)), shape: (1,)
# Parameters: x_0, y_0, a - called: mean_x, mean_y and opening in the following code:
# x and y should both lie in [0,1] - only [0,infinity] is enforced currently
class RBFLayer2D(Layer):
    def __init__(self,  **kwargs):
        super(RBFLayer2D, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.mean_x = K.variable(0.35)
        self.constraints[self.mean_x] = NonNeg()
        self.mean_y = K.variable(0.35)
        self.constraints[self.mean_y] = NonNeg()
        self.opening = K.variable(2.0)
        self.constraints[self.opening] = NonNeg()
        self.trainable_weights = [self.mean_x,self.mean_y,self.opening]
        super(RBFLayer2D, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, x):
        x_m = x[:,0] - self.mean_x
        y_m = x[:,1] - self.mean_y
        out = x_m*x_m + y_m*y_m
        outexp = 50.0*K.exp(-64.8*self.opening*out)
        # Output: exp(-a* ((x-x_0)**2 + (y-y_0)**2))
        return outexp

    def compute_output_shape(self, input_shape):
        # If Inputshape is (None, N) Outputshape is (None,N/2)
        # In our example we only look at (None, 2), which outputs (None,1)
        output_shape = (input_shape[0], input_shape[1]//2)
        return output_shape
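
For reference, this is how we wire the single-neuron layer into a model and read the fitted parameters back after training (just a usage sketch; it assumes the imports and the x_train/y_train arrays from the minimal example further down):

model = Sequential()
model.add(RBFLayer2D(input_shape=(2,)))
model.compile(loss="mean_squared_error", optimizer=SGD(lr=1e-5))
model.fit(x_train, y_train, epochs=5, batch_size=1)

# Read back the trained parameters of the layer
layer = model.layers[0]
print("mean_x :", K.get_value(layer.mean_x))
print("mean_y :", K.get_value(layer.mean_y))
print("opening:", K.get_value(layer.opening))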

Reproduction

To reproduce, set batch_size to 1 in the (not so) minimal example further down. When you run it, the code first shows the target distribution (a circle in the lower left corner), then the starting guess of our RBF ANN (a smaller circle in the middle), and then, after each iteration, the current guess (a circle that grows and moves towards the lower left corner). Then set batch_size to 12, restart the code, and you will no longer observe convergence.

Minimal example

from __future__ import print_function
from __future__ import division
import numpy as np
np.random.seed(1234)
import matplotlib.pyplot as plt
from keras.engine import Layer
from keras.optimizers import SGD
from keras.models import Sequential
from keras.constraints import NonNeg
from keras import backend as K

# 2D RBF Layer
# Input: x,y Pairs, shape: (2,)
# Output: exp(-a* ((x-x_0)**2 + (y-y_0)**2)), shape: (1,)
# Parameters: x_0, y_0, a - called: mean_x, mean_y and opening in the following code:
# x and y should both lie in [0,1] - only [0,infinity] is enforced currently
class RBFLayer2D(Layer):
    def __init__(self,  **kwargs):
        super(RBFLayer2D, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.mean_x = K.variable(0.35)
        self.constraints[self.mean_x] = NonNeg()
        self.mean_y = K.variable(0.35)
        self.constraints[self.mean_y] = NonNeg()
        self.opening = K.variable(2.0)
        self.constraints[self.opening] = NonNeg()
        self.trainable_weights = [self.mean_x,self.mean_y,self.opening]
        super(RBFLayer2D, self).build(input_shape)

    def call(self, x):
        x_m = x[:,0] - self.mean_x
        y_m = x[:,1] - self.mean_y
        out = x_m*x_m + y_m*y_m
        outexp = 50.0*K.exp(-64.8*self.opening*out)
        # Output: exp(-a* ((x-x_0)**2 + (y-y_0)**2))
        return outexp

    def compute_output_shape(self, input_shape):
        # If Inputshape is (None, N) Outputshape is (None,N/2)
        # In our example we only look at (None, 2), which outputs (None,1)
        output_shape = (input_shape[0], input_shape[1]//2)
        return output_shape

# The function we want to train.
# It can be exactly represented using a single Neuron.
def twodenergy(phi, psi):
    r0 = np.array([-180, -180])
    b = 0.00005
    return 50.0 * np.exp(- b * ((phi - r0[0]) ** 2 + (psi - r0[1]) ** 2))

# One of two plotting helper functions to show the results
def make_plot(y,numsteps,numbins,minangle,maxangle,plotnum, batch_size):
    evaluation = np.zeros((numsteps, numsteps))
    for i in range(0, numbins):
        mx = i % numsteps
        my = int(i / numsteps)
        evaluation[mx,my]=y[i]

    plt.imshow(evaluation.T, origin='lower',extent=[minangle, maxangle, minangle, maxangle])
    plt.xlabel("x")
    plt.ylabel("y")

    if plotnum == 0:       
        plt.title("Startconfiguration")
    else:
        plt.title("RBF for batch_size %i at frame %03d" % (batch_size, plotnum))

    plt.show()

# One of two plotting helper functions to show the target function
def plot_target_function(phi, psi, minangle, maxangle, delta_angle_half, numbins, numsteps ):
    eval_matrix_corr = np.zeros((numsteps, numsteps))
    for i in range(0, numbins):
        mx = i % numsteps
        my = int(i / numsteps)
        ph = phi[mx] +delta_angle_half
        ps = psi[my] +delta_angle_half
        eval_matrix_corr[mx,my] = twodenergy(ph,ps)

    plt.imshow(eval_matrix_corr.T, origin='lower', extent=[minangle, maxangle, minangle, maxangle])
    plt.title("Target Function")
    plt.xlabel("phi")
    plt.ylabel("psi")
    plt.show()


if __name__ == "__main__":
    # batch_size ==    1:     converges very often, nearly independent of the input parameters
    # batch_size ==    2:     no or only slow convergence, but the distribution stays in the right place more or less
    # batch_size == 3-12:     random walk
    # batch_size == 1296:     no movement for a low learning_rate, random walk for a high learning_rate
    #                         (this is the case where the whole map is evaluated in every step;
    #                         1296 is our desired test case, because it evaluates the whole map we want to fit)
    batch_size = 1
    learning_rate = 1E-5

    ### Here we generate the target function ###
    ### f(phi,psi)
    ### phi is [-180,180]
    ### psi is [-180,180]
    anglestep = 10.0
    minangle = -180.0
    maxangle = 180.0
    numsteps = int((maxangle - minangle)/anglestep)
    anglerange = maxangle - minangle
    numbins = numsteps*numsteps
    delta_angle_half = anglerange /(2.0* numsteps)

    phi = np.arange(minangle, maxangle, anglestep)
    psi = np.arange(minangle, maxangle, anglestep)

    #Target Function Plot, Gaussian in lower left
    plot_target_function(phi, psi, minangle, maxangle, delta_angle_half, numbins, numsteps )


    # Input Parameter Regularization
    # we map -180..180 to 0..1
    # we also calculate the training parameters for our x,y pairs:
    x_train = np.zeros((numbins, 2))
    y_train = np.zeros((numbins, 1))
    for x,ph in enumerate(phi):
        for y,ps in enumerate(psi):
            myphi = (ph + delta_angle_half - minangle)/(anglerange)
            mypsi = (ps + delta_angle_half- minangle)/(anglerange)

            x_train[x * numsteps + y, 0] = (ph +delta_angle_half - minangle)/(anglerange)
            x_train[x * numsteps + y, 1] = (ps + delta_angle_half- minangle)/(anglerange)
            y_train[x * numsteps + y] = twodenergy(ph +delta_angle_half,ps +delta_angle_half)

    # Prediction with Keras
    model = Sequential()
    # Single RBF Layer, only one node
    model.add(RBFLayer2D(input_shape=(2,)))
    sgd = SGD(lr=learning_rate)
    model.compile(loss="mean_squared_error", optimizer=sgd)

    # We plot the starting configuration.
    y = model.predict(x_train, batch_size=batch_size)
    make_plot(y, numsteps, numbins, minangle, maxangle, 0, batch_size)

    #Plot the first 15 iterations:
    for i in range(0,15):
        # For demonstration purposes, we fit 1 epoch and plot the output.  
        model.fit(x_train,y_train, epochs=1, batch_size=batch_size)
        y = model.predict(x_train, batch_size=batch_size)
        make_plot(y, numsteps, numbins, minangle, maxangle, 1 + i, batch_size)
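
To quantify the batch size dependence described above, the following sketch (meant to be appended to the minimal example so that RBFLayer2D, x_train, y_train and learning_rate are in scope) reruns the same fit for several batch sizes and prints the final loss of each run:

# Rerun the single-neuron fit for several batch sizes and compare final losses
for bs in [1, 2, 12, 1296]:
    m = Sequential()
    m.add(RBFLayer2D(input_shape=(2,)))
    m.compile(loss="mean_squared_error", optimizer=SGD(lr=learning_rate))
    history = m.fit(x_train, y_train, epochs=15, batch_size=bs, verbose=0)
    print("batch_size %4d -> final loss %.6f" % (bs, history.history["loss"][-1]))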

0 Answers:

No answers yet.