我试图用SGD在pytorch中训练一个简单的多项式线性回归模型。我写了一些自包含(我认为是非常简单的代码),然而,由于某种原因,我的模型没有按照我的想法进行训练。
我从正弦曲线中采样5个点并尝试用4度多项式拟合它。这是一个凸问题,因此GD或SGD应该找到一个零列车误差的解决方案,只要我们有足够的迭代和足够小的步长。出于某种原因,我的模型不能很好地训练(即使它似乎正在改变模型的参数。任何人都知道为什么?这是代码(我试着让它自包含和最小):
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
import torch
from torch.autograd import Variable
from maps import NamedDict
from plotting_utils import *
def index_batch(X,batch_indices,dtype):
'''
returns the batch indexed/sliced batch
'''
if len(X.shape) == 1: # i.e. dimension (M,) just a vector
batch_xs = torch.FloatTensor(X[batch_indices]).type(dtype)
else:
batch_xs = torch.FloatTensor(X[batch_indices,:]).type(dtype)
return batch_xs
def get_batch2(X,Y,M,dtype):
'''
get batch for pytorch model
'''
# TODO fix and make it nicer, there is pytorch forum question
X,Y = X.data.numpy(), Y.data.numpy()
N = len(Y)
valid_indices = np.array( range(N) )
batch_indices = np.random.choice(valid_indices,size=M,replace=False)
batch_xs = index_batch(X,batch_indices,dtype)
batch_ys = index_batch(Y,batch_indices,dtype)
return Variable(batch_xs, requires_grad=False), Variable(batch_ys, requires_grad=False)
def get_sequential_lifted_mdl(nb_monomials,D_out, bias=False):
return torch.nn.Sequential(torch.nn.Linear(nb_monomials,D_out,bias=bias))
def train_SGD(mdl, M,eta,nb_iter,logging_freq ,dtype, X_train,Y_train):
##
N_train,_ = tuple( X_train.size() )
#print(N_train)
for i in range(nb_iter):
# Forward pass: compute predicted Y using operations on Variables
batch_xs, batch_ys = get_batch2(X_train,Y_train,M,dtype) # [M, D], [M, 1]
## FORWARD PASS
y_pred = mdl.forward(batch_xs)
## LOSS + Regularization
batch_loss = (1/M)*(y_pred - batch_ys).pow(2).sum()
## BACKARD PASS
batch_loss.backward() # Use autograd to compute the backward pass. Now w will have gradients
## SGD update
for W in mdl.parameters():
delta = eta*W.grad.data
W.data.copy_(W.data - delta)
## train stats
if i % (nb_iter/10) == 0 or i == 0:
current_train_loss = (1/N_train)*(mdl.forward(X_train) - Y_train).pow(2).sum().data.numpy()
print('i = {}, current_loss = {}'.format(i, current_train_loss ) )
## Manually zero the gradients after updating weights
mdl.zero_grad()
##
logging_freq = 100
dtype = torch.FloatTensor
## SGD params
M = 3
eta = 0.0002
nb_iter = 20*1000
##
lb,ub = 0,1
f_target = lambda x: np.sin(2*np.pi*x)
N_train = 5
X_train = np.linspace(lb,ub,N_train)
Y_train = f_target(X_train)
## degree of mdl
Degree_mdl = 4
## pseudo-inverse solution
c_pinv = np.polyfit( X_train, Y_train , Degree_mdl )[::-1]
## linear mdl to train with SGD
nb_terms = c_pinv.shape[0]
mdl_sgd = get_sequential_lifted_mdl(nb_monomials=nb_terms,D_out=1, bias=False)
## Make polynomial Kernel
poly_feat = PolynomialFeatures(degree=Degree_mdl)
Kern_train = poly_feat.fit_transform(X_train.reshape(N_train,1))
Kern_train_pt, Y_train_pt = Variable(torch.FloatTensor(Kern_train).type(dtype), requires_grad=False), Variable(torch.FloatTensor(Y_train).type(dtype), requires_grad=False)
train_SGD(mdl_sgd, M,eta,nb_iter,logging_freq ,dtype, Kern_train_pt,Y_train_pt)
错误似乎徘徊在2ish:
i = 0, current_loss = [ 2.08996224]
i = 2000, current_loss = [ 2.03536892]
i = 4000, current_loss = [ 2.02014995]
i = 6000, current_loss = [ 2.01307297]
i = 8000, current_loss = [ 2.01300406]
i = 10000, current_loss = [ 2.01125693]
i = 12000, current_loss = [ 2.01162267]
i = 14000, current_loss = [ 2.01296973]
i = 16000, current_loss = [ 2.00951076]
i = 18000, current_loss = [ 2.00967121]
这很奇怪,因为它应该能够达到零。
我还绘制了学习函数:
绘图的代码:
##
x_horizontal = np.linspace(lb,ub,1000).reshape(1000,1)
X_plot = poly_feat.fit_transform(x_horizontal)
X_plot_pytorch = Variable( torch.FloatTensor(X_plot), requires_grad=False)
##
fig1 = plt.figure()
#plots objs
p_sgd, = plt.plot(x_horizontal, [ float(f_val) for f_val in mdl_sgd.forward(X_plot_pytorch).data.numpy() ])
p_pinv, = plt.plot(x_horizontal, np.dot(X_plot,c_pinv))
p_data, = plt.plot(X_train,Y_train,'ro')
## legend
nb_terms = c_pinv.shape[0]
legend_mdl = f'SGD solution standard parametrization, number of monomials={nb_terms}, batch-size={M}, iterations={nb_iter}, step size={eta}'
plt.legend(
[p_sgd,p_pinv,p_data],
[legend_mdl,f'linear algebra soln, number of monomials={nb_terms}',f'data points = {N_train}']
)
##
plt.xlabel('x'), plt.ylabel('f(x)')
plt.show()
我实际上已经实现了TensorFlow版本。那似乎确实训练了模型。我尝试通过给它们进行相同的初始化来使它们都匹配:
mdl_sgd[0].weight.data.fill_(0)
但仍然没有奏效。 Tensorflow代码:
graph = tf.Graph()
with graph.as_default():
X = tf.placeholder(tf.float32, [None, nb_terms])
Y = tf.placeholder(tf.float32, [None,1])
w = tf.Variable( tf.zeros([nb_terms,1]) )
#w = tf.Variable( tf.truncated_normal([Degree_mdl,1],mean=0.0,stddev=1.0) )
#w = tf.Variable( 1000*tf.ones([Degree_mdl,1]) )
##
f = tf.matmul(X,w) # [N,1] = [N,D] x [D,1]
#loss = tf.reduce_sum(tf.square(Y - f))
loss = tf.reduce_sum( tf.reduce_mean(tf.square(Y-f), 0))
l2loss_tf = (1/N_train)*2*tf.nn.l2_loss(Y-f)
##
learning_rate = eta
#global_step = tf.Variable(0, trainable=False)
#learning_rate = tf.train.exponential_decay(learning_rate=eta, global_step=global_step,decay_steps=nb_iter/2, decay_rate=1, staircase=True)
train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)
with tf.Session(graph=graph) as sess:
Y_train = Y_train.reshape(N_train,1)
tf.global_variables_initializer().run()
# Train
for i in range(nb_iter):
#if i % (nb_iter/10) == 0:
if i % (nb_iter/10) == 0 or i == 0:
current_loss = sess.run(fetches=loss, feed_dict={X: Kern_train, Y: Y_train})
print(f'i = {i}, current_loss = {current_loss}')
## train
batch_xs, batch_ys = get_batch(Kern_train,Y_train,M)
sess.run(train_step, feed_dict={X: batch_xs, Y: batch_ys})
我也尝试过更改初始化,但它没有改变任何东西,这是有道理的,因为它不应该产生很大的不同:
mdl_sgd[0].weight.data.normal_(mean=0,std=0.001)
原帖:
这应该是这样的:
解决方案:
似乎结果是作为向量而不是导致问题的数字返回的问题。即以下代码固定的东西:
y_pred = model(batch_xs).view(-1) # change this to "y_pred = model(batch_xs)" to get the incorrect results
loss = (y_pred - batch_ys).pow(2).mean()
这对我来说似乎完全神秘。有人知道为什么这解决了这个问题吗?它看起来很神奇。
答案 0 :(得分:1)
这个错误非常微妙,但实质上是因为pytorch正在使用numpy广播规则。因此,当列向量(3,1)
和数组(即dim为(3,)
)时,会发生的是广播产生(3,3)
矩阵(请注意,当您减去a时,这不会发生行向量(1,3)
向量带有(3,)
数组,我猜数组被视为行向量)。这非常糟糕,因为这意味着我们计算每个标签和每个预测之间所有成对差异的矩阵。当然,这是荒谬的,并且会产生错误,因为我们不希望第一个标签点的预测与数据集中每个其他标签的预测相匹配。当然,这不会产生任何合理的结果。
因此,答案似乎只是为了避免错误的numpy广播,无论是在培训期间还是在数据输入之前重塑事物。任何一个都应该工作。
为了避免可以附加的错误,请使用以下代码:
def check_vectors_have_same_dimensions(Y,Y_):
'''
Checks that vector Y and Y_ have the same dimensions. If they don't
then there might be an error that could be caused due to wrong broadcasting.
'''
DY = tuple( Y.size() )
DY_ = tuple( Y_.size() )
if len(DY) != len(DY_):
return True
for i in range(len(DY)):
if DY[i] != DY_[i]:
return True
return False