我用Python写了一个非常简单的逻辑回归实现,但是由于某些原因,损失函数并未减少。这是我的实现:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
class LogisticRegression:
def __init__(self, input_size, alpha, std, learning_rate, epsilon, num_epochs):
self.W = std * np.random.randn(input_size + 1)
self.alpha = alpha
self.learning_rate = learning_rate
self.epsilon = epsilon
self.num_epochs = num_epochs
def fit(self, X, y):
X = np.append(X, np.ones((X.shape[0], 1)), axis=1) # bias term
N, D = X.shape
self.loss_hist_ = []
for epoch in range(self.num_epochs):
loss, loss_grads = self.loss(X, y)
self.loss_hist_.append(loss)
print('Epoch', epoch+1, 'loss', loss)
if loss < self.epsilon:
break
self.W -= self.learning_rate * loss_grads
return self
def predict(self, X):
X = np.append(X, np.ones((X.shape[0], 1)), axis=1) # bias term
probs = self.sigmoid_Wx(X)
y_pred = (probs >= 0.5)
return probs, y_pred
def sigmoid_Wx(self, X):
scores = np.matmul(X, self.W) # (N, D) x (D,) = (N,), one score per datapoint
return 1 / (1 + np.exp(-scores))
def loss(self, X, y):
N, D = X.shape
probs = self.sigmoid_Wx(X)
data_loss = (-1 / N) * sum(y * np.log(probs) + (1 - y) * np.log(1. - probs))
reg_loss = self.alpha * self.W.dot(self.W)
loss = data_loss + reg_loss
loss_grads = (-1 / N) * np.matmul(X.T, probs - y) # (D, N) x (N,) = (D,)
loss_grads += 2*self.alpha*self.W
return loss, loss_grads
DATA_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', '..', 'Data'))
FIGURES_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'Figures'))
def main():
X_train, y_train, X_val, y_val = map(lambda fn: pd.read_csv(os.path.join(DATA_DIR, fn)), ['X_train.csv', 'Y_train.csv', 'X_val.csv', 'Y_val.csv'])
X_train = X_train['Review Text'] # ignore all feats besides review text
X_val = X_val['Review Text']
y_train = (y_train['Sentiment'] == 'Positive').values.astype(int) # binarize labels
y_val = (y_val['Sentiment'] == 'Positive').values.astype(int)
print('Transforming into bow representation')
vocab = sorted(CountVectorizer().fit(X_train).vocabulary_.keys())
# todo: don't use dense arrays here
X_train = CountVectorizer(vocabulary=vocab).fit_transform(X_train).toarray()
X_val = CountVectorizer(vocabulary=vocab).fit_transform(X_val).toarray()
lr = LogisticRegression(input_size=len(vocab), alpha=0., std=1e-4, learning_rate=1e-5, epsilon=1e-8, num_epochs=10)
print('Fitting training set')
lr.fit(X_train, y_train)
print('Making predictions for validation set')
y_val_probs, y_val_pred = lr.predict(X_val)
print(roc_auc_score(y_val, y_val_probs))
if __name__ == '__main__':
main()
这是我得到的输出:
$ py -3 LR_Template.py
Transforming into bow representation
Fitting training set
Epoch 1 loss 0.6935215358862573
Epoch 2 loss 0.6935650560886688
Epoch 3 loss 0.6936085850502569
Epoch 4 loss 0.69365212277282
Epoch 5 loss 0.6936956692581194
Epoch 6 loss 0.6937392245079684
Epoch 7 loss 0.6937827885241183
Epoch 8 loss 0.6938263613083845
Epoch 9 loss 0.6938699428625258
Epoch 10 loss 0.6939135331883654
Making predictions for validation set
0.5702245722219217
我不确定我在哪里错了?据我所知,所有的数学运算都是正确的。