I am using logistic regression to classify a hospital-mortality problem. I implemented LR from scratch in Python and use SGD to fit the model. When I initialize the parameter vector W with zeros it performs well, but when I initialize W randomly it performs much worse. I tuned the learning rate and still could not improve it. This makes no sense to me in theory: the logistic loss with an L2 regularizer is convex, so SGD should be able to find the optimum regardless of the starting point. Can anyone explain this? It would help me a lot. Thanks.
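For reference, the objective I intend to minimize (and whose convexity I am relying on) is the L2-regularized logistic loss that `cal_grad` below computes, with $\sigma$ the sigmoid; note that `cal_grad` returns the negative of this gradient and the update step adds it:

$$
L(w) = -\frac{1}{N}\sum_{i=1}^{N}\Big[y_i \log \sigma(w^\top x_i) + (1-y_i)\log\big(1-\sigma(w^\top x_i)\big)\Big] + \frac{\lambda}{2}\lVert w \rVert_2^2,
\qquad
\nabla L(w) = \frac{1}{N} X^\top\big(\sigma(Xw) - y\big) + \lambda w
$$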
```python
import os
import csv
import numpy as np
import sys
import time
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, OneHotEncoder
home_dir = os.path.split(os.path.realpath(__file__))[0]
sys.path.append(os.path.join(home_dir))
class Arguments():
    def __init__(self):
        self.seed = 3
        self.n_local = 5
        self.batch_size = 64
        self.lr = 5.0            # initial learning rate (decayed during the first epoch)
        self.lbda = 0.00001      # L2 regularization strength
        self.epochs = 20
        self.epsilon = 1e-7      # numerical stability term inside log()

args = Arguments()
def load_data(path):
    x_train = np.load(path + 'x_train.npy')
    x_test = np.load(path + 'x_test.npy')
    y_train = np.load(path + 'y_train.npy')
    y_test = np.load(path + 'y_test.npy')
    x_val = np.load(path + 'x_val.npy')
    y_val = np.load(path + 'y_val.npy')
    x_train = np.concatenate((x_train, x_val), axis=0)
    y_train = np.concatenate((y_train, y_val), axis=0)
    y_train = np.expand_dims(y_train, 1)
    # y_test = np.expand_dims(y_test, 1)
    return x_train, x_test, y_train, y_test
def sigmoid(x):
    # numerically stable sigmoid: equals 1 / (1 + exp(-x)) without overflow
    return np.exp(np.fmin(x, 0)) / (1.0 + np.exp(-np.abs(x)))
def cal_grad(w, x, y, lbda):
    # x: shape (N, features), y: shape (N, 1), w: shape (features, 1)
    w_x_mul = np.dot(x, w)
    h_w = sigmoid(w_x_mul)
    d = y - h_w
    # ascent direction of the regularized log-likelihood (the caller updates
    # with w = w + lr * grad), so the L2 term enters with a minus sign
    grad = np.dot(x.T, d) / x.shape[0] - lbda * w
    loss = (-np.sum(y * np.log(h_w + args.epsilon)
                    + (1 - y) * np.log(1 - h_w + args.epsilon)) / x.shape[0]
            + lbda * np.sum(w ** 2) / 2.0)
    return grad, loss
def cal_auc_acc(w, x_test, y_test):
    w_x_mul = np.dot(x_test, w)
    probaOfPositive = sigmoid(w_x_mul)
    probaOfNegative = 1.0 - probaOfPositive
    proba = np.hstack((probaOfNegative, probaOfPositive))
    y_pred = np.argmax(proba, axis=1)
    acc = np.sum(y_pred == y_test) / float(y_test.shape[0])
    auc = metrics.roc_auc_score(y_test, probaOfPositive)
    confusionMatrix = metrics.confusion_matrix(y_test, y_pred)
    return acc, auc, confusionMatrix
def glorot_normal(fan_in, fan_out):
    # Glorot/Xavier normal initialization
    stddev = np.sqrt(2.0 / (fan_in + fan_out))
    return np.random.normal(0, stddev, (fan_in, fan_out))
def train(x, y, x_test, y_test, lbda):
    n_samples, features = x.shape[0], x.shape[1]
    # w = np.ones((features, 1)) * 5
    # w = np.zeros((features, 1))   # zero init: works well
    w = glorot_normal(features, 1)  # random init: performs poorly
    n_batchs = int(n_samples / args.batch_size)
    # np.random.seed(args.seed)
    rng = np.random.RandomState(args.seed)
    indexes = np.arange(n_samples)
    rng.shuffle(indexes)
    acc_li = []
    auc_li = []
    grad_norm = []
    loss_li = []
    init_grad, init_loss = cal_grad(w, x, y, lbda)
    print("initial loss: {}, init grad norm: {}".format(init_loss, np.sum(init_grad ** 2)))
    init_acc, init_auc, init_cf = cal_auc_acc(w, x_test, y_test)
    print("init acc: {}, init_auc: {}, init confusion matrix: {}".format(init_acc, init_auc, init_cf))
    acc_li.append(init_acc)
    auc_li.append(init_auc)
    grad_norm.append(np.sum(init_grad ** 2))
    loss_li.append(init_loss)
    step = 0
    base = args.lr
    for e in range(args.epochs):
        lssum = 0
        for i_batch in range(n_batchs):
            # decay the learning rate from `base` over the first 135 steps,
            # then keep it fixed at 5e-2
            if e == 0 and step <= 135:
                args.lr = base / (e * n_batchs + i_batch + 1)
            else:
                args.lr = 5e-2
            cur_indexes = indexes[i_batch * args.batch_size:min((i_batch + 1) * args.batch_size, n_samples)]
            x_batch = x[cur_indexes]
            y_batch = y[cur_indexes]
            grad, loss = cal_grad(w, x_batch, y_batch, args.lbda)
            w = w + args.lr * grad
            lssum += loss
            step += 1
            if step % 5 == 0:
                acc, auc, cf = cal_auc_acc(w, x_test, y_test)
                total_grad, total_loss = cal_grad(w, x, y, args.lbda)
                acc_li.append(acc)
                auc_li.append(auc)
                grad_norm.append(np.sum(total_grad ** 2))
                loss_li.append(total_loss)
```
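For completeness, this is roughly how I drive the functions above. The data directory is a placeholder and the feature-scaling step is an assumption (it is not shown in the snippet); nothing else changes between the zero-init and random-init runs except the commented-out lines at the top of `train`:

```python
# Minimal sketch of the driver code (assumed, not verbatim from my script).
# './data/' is a placeholder for wherever the .npy files live.
if __name__ == '__main__':
    x_train, x_test, y_train, y_test = load_data('./data/')

    # Standardizing the features is assumed here; StandardScaler is imported
    # above but its use does not appear in the snippet.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Switch between zero and random initialization by toggling the
    # commented-out lines inside train().
    train(x_train, y_train, x_test, y_test, args.lbda)
```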