I am using logistic regression to classify a hospital-mortality problem. I implemented LR from scratch in Python and use SGD to fit the model. When I initialize the parameter vector W with zeros it performs well, but when I initialize W randomly it performs much worse. I tuned the learning rate and still could not improve it. This makes no sense to me in theory: the logistic loss with an L2 regularizer is convex, so SGD should be able to find the optimum regardless of the starting point. Can anyone explain this? It would help me a lot. Thanks.
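For reference, the objective I intend to minimize (and whose convexity I am relying on) is the L2-regularized logistic loss that `cal_grad` below computes, with $\sigma$ the sigmoid; note that `cal_grad` returns the negative of this gradient and the update step adds it:

$$
L(w) = -\frac{1}{N}\sum_{i=1}^{N}\Big[y_i \log \sigma(w^\top x_i) + (1-y_i)\log\big(1-\sigma(w^\top x_i)\big)\Big] + \frac{\lambda}{2}\lVert w \rVert_2^2,
\qquad
\nabla L(w) = \frac{1}{N} X^\top\big(\sigma(Xw) - y\big) + \lambda w
$$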
```python
import os
import csv
import numpy as np
import sys
import time
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, OneHotEncoder
home_dir = os.path.split(os.path.realpath(__file__))[0]
sys.path.append(os.path.join(home_dir))
class Arguments():
    def __init__(self):
        self.seed = 3
        self.n_local = 5
        self.batch_size = 64
        self.lr = 5.0            # initial learning rate (decayed during the first epoch)
        self.lbda = 0.00001      # L2 regularization strength
        self.epochs = 20
        self.epsilon = 1e-7      # numerical stability term inside log()

args = Arguments()
def load_data(path):
    x_train = np.load(path + 'x_train.npy')
    x_test = np.load(path + 'x_test.npy')
    y_train = np.load(path + 'y_train.npy')
    y_test = np.load(path + 'y_test.npy')
    x_val = np.load(path + 'x_val.npy')
    y_val = np.load(path + 'y_val.npy')
    x_train = np.concatenate((x_train, x_val), axis=0)
    y_train = np.concatenate((y_train, y_val), axis=0)
    y_train = np.expand_dims(y_train, 1)
    # y_test = np.expand_dims(y_test, 1)
    return x_train, x_test, y_train, y_test
def sigmoid(x):
    # numerically stable sigmoid: equals 1 / (1 + exp(-x)) without overflow
    return np.exp(np.fmin(x, 0)) / (1.0 + np.exp(-np.abs(x)))
def cal_grad(w, x, y, lbda):
    # x: shape (N, features), y: shape (N, 1), w: shape (features, 1)
    w_x_mul = np.dot(x, w)
    h_w = sigmoid(w_x_mul)
    d = y - h_w
    # ascent direction of the regularized log-likelihood (the caller updates
    # with w = w + lr * grad), so the L2 term enters with a minus sign
    grad = np.dot(x.T, d) / x.shape[0] - lbda * w
    loss = (-np.sum(y * np.log(h_w + args.epsilon)
                    + (1 - y) * np.log(1 - h_w + args.epsilon)) / x.shape[0]
            + lbda * np.sum(w ** 2) / 2.0)
    return grad, loss
def cal_auc_acc(w, x_test, y_test):
    w_x_mul = np.dot(x_test, w)
    probaOfPositive = sigmoid(w_x_mul)
    probaOfNegative = 1.0 - probaOfPositive
    proba = np.hstack((probaOfNegative, probaOfPositive))
    y_pred = np.argmax(proba, axis=1)
    acc = np.sum(y_pred == y_test) / float(y_test.shape[0])
    auc = metrics.roc_auc_score(y_test, probaOfPositive)
    confusionMatrix = metrics.confusion_matrix(y_test, y_pred)
    return acc, auc, confusionMatrix
def glorot_normal(fan_in, fan_out):
    # Glorot/Xavier normal initialization
    stddev = np.sqrt(2.0 / (fan_in + fan_out))
    return np.random.normal(0, stddev, (fan_in, fan_out))
def train(x, y, x_test, y_test, lbda):
    n_samples, features = x.shape[0], x.shape[1]
    # w = np.ones((features, 1)) * 5
    # w = np.zeros((features, 1))   # zero init: works well
    w = glorot_normal(features, 1)  # random init: performs poorly
    n_batchs = int(n_samples / args.batch_size)
    # np.random.seed(args.seed)
    rng = np.random.RandomState(args.seed)
    indexes = np.arange(n_samples)
    rng.shuffle(indexes)
    acc_li = []
    auc_li = []
    grad_norm = []
    loss_li = []
    init_grad, init_loss = cal_grad(w, x, y, lbda)
    print("initial loss: {}, init grad norm: {}".format(init_loss, np.sum(init_grad ** 2)))
    init_acc, init_auc, init_cf = cal_auc_acc(w, x_test, y_test)
    print("init acc: {}, init_auc: {}, init confusion matrix: {}".format(init_acc, init_auc, init_cf))
    acc_li.append(init_acc)
    auc_li.append(init_auc)
    grad_norm.append(np.sum(init_grad ** 2))
    loss_li.append(init_loss)
    step = 0
    base = args.lr
    for e in range(args.epochs):
        lssum = 0
        for i_batch in range(n_batchs):
            # decay the learning rate from `base` over the first 135 steps,
            # then keep it fixed at 5e-2
            if e == 0 and step <= 135:
                args.lr = base / (e * n_batchs + i_batch + 1)
            else:
                args.lr = 5e-2
            cur_indexes = indexes[i_batch * args.batch_size:min((i_batch + 1) * args.batch_size, n_samples)]
            x_batch = x[cur_indexes]
            y_batch = y[cur_indexes]
            grad, loss = cal_grad(w, x_batch, y_batch, args.lbda)
            w = w + args.lr * grad
            lssum += loss
            step += 1
            if step % 5 == 0:
                acc, auc, cf = cal_auc_acc(w, x_test, y_test)
                total_grad, total_loss = cal_grad(w, x, y, args.lbda)
                acc_li.append(acc)
                auc_li.append(auc)
                grad_norm.append(np.sum(total_grad ** 2))
                loss_li.append(total_loss)
```
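For completeness, this is roughly how I drive the functions above. The data directory is a placeholder and the feature-scaling step is an assumption (it is not shown in the snippet); nothing else changes between the zero-init and random-init runs except the commented-out lines at the top of `train`:

```python
# Minimal sketch of the driver code (assumed, not verbatim from my script).
# './data/' is a placeholder for wherever the .npy files live.
if __name__ == '__main__':
    x_train, x_test, y_train, y_test = load_data('./data/')

    # Standardizing the features is assumed here; StandardScaler is imported
    # above but its use does not appear in the snippet.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Switch between zero and random initialization by toggling the
    # commented-out lines inside train().
    train(x_train, y_train, x_test, y_test, args.lbda)
```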