I have been trying to implement Andrew Ng's regularized logistic regression in Python, but I am finding it hard to optimize it to get the correct accuracy. The exercise documentation says the accuracy should be 83% when lambda = 1, but I am getting 64%.
If anyone can point out how to improve the accuracy, it would be greatly appreciated!
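For reference, the regularized cost and its gradient that the exercise asks for (the standard formulas, stated here so the code below can be checked against them) are:

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\Big[y^{(i)}\log h_\theta(x^{(i)}) + \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big)\Big] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$

$$\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\,x_j^{(i)} + \frac{\lambda}{m}\theta_j \qquad (j \geq 1)$$

with the $\frac{\lambda}{m}\theta_j$ term omitted for $j = 0$, since the bias term is not regularized.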
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from sklearn.preprocessing import PolynomialFeatures
#Import Dataset
dataset = pd.read_csv("Microchip Test Dataset.txt", names=["Test 1", "Test 2", "Accepted"])
print(dataset.head())
positive = dataset[dataset["Accepted"] == 1]
negative = dataset[dataset["Accepted"] == 0]
#Visualizing Dataset
plt.scatter(positive["Test 1"], positive["Test 2"], color="red", marker="o", label="Accepted")
plt.scatter(negative["Test 1"], negative["Test 2"], color="blue", marker="x", label="Rejected")
plt.title("Microchip Test")
plt.xlabel("Test 1")
plt.ylabel("Test 2")
plt.legend()
#plt.show()
#Preprocessing Data
col = len(dataset.columns)
x1 = dataset["Test 1"].values
x2 = dataset["Test 2"].values
dataset.insert(col, "Ones", 1)
#Creating Polynomial Features MANUALLY
degree = 5
for i in range(1, degree):
    for j in range(i):
        dataset["P" + str(i) + str(j)] = np.power(x1, i - j) * np.power(x2, j)
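# With degree = 5 this builds 10 columns (P10, P20, P21, P30..P32, P40..P43),
# which together with the "Ones" column gives the 11 features that
# b = np.zeros(11) below expects.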
dataset.drop(columns=["Test 1", "Test 2"], inplace=True)
print("\nModified Dataset")
print(dataset.head())
col = len(dataset.columns)
x = dataset.iloc[:,1:col].values
y = dataset.iloc[:,0:1].values
b = np.zeros(11)
m = len(x)
l = 1
count = 0
print(f"\nX Shape : {x.shape} Y Shape : {y.shape} B Shape : {b.shape}")
#Defining Functions
def sigmoid(z):
    h = 1 / (1 + np.exp(-z))
    return h
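# Aside (not part of the exercise): np.exp(-z) can overflow for very negative
# z, and sigmoid can saturate to exactly 0 or 1 in float64, which turns the
# np.log calls in cost() into -inf; clipping z or the output is a common guard.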
def cost(b, x, y, l):
    first = y.T.dot(np.log(sigmoid(x.dot(b.T))))
    second = (1 - y).T.dot(np.log(1 - sigmoid(x.dot(b.T))))
    reg_parameter = (l / (2 * m)) * np.sum(b[1:] ** 2)
    j = (-1 / m) * np.sum(first + second) + reg_parameter
    return j
def gradient_descent(b, x, y, l):
    b = (1 / m) * ((sigmoid(x.dot(b.T)) - y).T.dot(x))
    b[1:] = b[1:] + b[1:] * (l / m)
    return b
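# Shape note: minimize() calls this as jac, so it must return dJ/db with the
# same shape as b, i.e. (11,); x.dot(b.T) is (m,), which matches the flattened
# y passed in below.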
def accuracy(x, y, b):
    p = sigmoid(x.dot(b.T))
    return [1 if i >= 0.5 else 0 for i in p]
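# Despite its name, this returns the list of 0/1 predictions (the y argument
# is unused); the actual accuracy is tallied in the loop at the bottom.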
#Output
initial_cost = cost(b, x, y, l)
print(f"\nInitial Cost = {initial_cost} \nInitial Theta = {b}")
result = minimize(cost, b, args=(x,y.flatten(),l), jac=gradient_descent)
b = result.x
print(f"Beta Values: {b}")
print(result)
prediction = accuracy(x, y, b)
for e, f in zip(prediction, y):
    if e == f:
        count += 1
accuracy_score = count/m * 100
print(f"Accuracy = {accuracy_score}")