I have been trying to implement Andrew Ng's regularized logistic regression in Python, but I am finding it hard to optimize it to get the correct accuracy. The exercise documentation says the accuracy should be 83% when lambda = 1, but I am getting 64%.
If anyone can point out how to improve the accuracy, it would be greatly appreciated!
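For reference, the regularized cost and its gradient that the exercise asks for (the standard formulas, stated here so the code below can be checked against them) are:

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\Big[y^{(i)}\log h_\theta(x^{(i)}) + \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big)\Big] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$

$$\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\,x_j^{(i)} + \frac{\lambda}{m}\theta_j \qquad (j \geq 1)$$

with the $\frac{\lambda}{m}\theta_j$ term omitted for $j = 0$, since the bias term is not regularized.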
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from sklearn.preprocessing import PolynomialFeatures
#Import Dataset
dataset = pd.read_csv("Microchip Test Dataset.txt", names=["Test 1", "Test 2", "Accepted"])
print(dataset.head())
positive = dataset[dataset["Accepted"] == 1]
negative = dataset[dataset["Accepted"] == 0]
#Visualizing Dataset
plt.scatter(positive["Test 1"], positive["Test 2"], color="red", marker="o", label="Accepted")
plt.scatter(negative["Test 1"], negative["Test 2"], color="blue", marker="x", label="Rejected")
plt.title("Microchip Test")
plt.xlabel("Test 1")
plt.ylabel("Test 2")
plt.legend()
#plt.show()
#Preprocessing Data
col = len(dataset.columns)
x1 = dataset["Test 1"].values
x2 = dataset["Test 2"].values
dataset.insert(col, "Ones", 1)
#Creating Polynomial Features MANUALLY
degree = 5
for i in range(1, degree):
    for j in range(i):
        dataset["P" + str(i) + str(j)] = np.power(x1, i - j) * np.power(x2, j)
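# With degree = 5 this builds 10 columns (P10, P20, P21, P30..P32, P40..P43),
# which together with the "Ones" column gives the 11 features that
# b = np.zeros(11) below expects.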
dataset.drop(columns=["Test 1", "Test 2"], inplace=True)
print("\nModified Dataset")
print(dataset.head())
col = len(dataset.columns)
x = dataset.iloc[:,1:col].values
y = dataset.iloc[:,0:1].values
b = np.zeros(11)
m = len(x)
l = 1
count = 0
print(f"\nX Shape : {x.shape} Y Shape : {y.shape} B Shape : {b.shape}")
#Defining Functions
def sigmoid(z):
    h = 1 / (1 + np.exp(-z))
    return h
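# Aside (not part of the exercise): np.exp(-z) can overflow for very negative
# z, and sigmoid can saturate to exactly 0 or 1 in float64, which turns the
# np.log calls in cost() into -inf; clipping z or the output is a common guard.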
def cost(b, x, y, l):
    first = y.T.dot(np.log(sigmoid(x.dot(b.T))))
    second = (1 - y).T.dot(np.log(1 - sigmoid(x.dot(b.T))))
    reg_parameter = (l / (2 * m)) * np.sum(b[1:] ** 2)
    j = (-1 / m) * np.sum(first + second) + reg_parameter
    return j
def gradient_descent(b, x, y, l):
    b = (1 / m) * ((sigmoid(x.dot(b.T)) - y).T.dot(x))
    b[1:] = b[1:] + b[1:] * (l / m)
    return b
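# Shape note: minimize() calls this as jac, so it must return dJ/db with the
# same shape as b, i.e. (11,); x.dot(b.T) is (m,), which matches the flattened
# y passed in below.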
def accuracy(x, y, b):
    p = sigmoid(x.dot(b.T))
    return [1 if i >= 0.5 else 0 for i in p]
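# Despite its name, this returns the list of 0/1 predictions (the y argument
# is unused); the actual accuracy is tallied in the loop at the bottom.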
#Output
initial_cost = cost(b, x, y, l)
print(f"\nInitial Cost = {initial_cost} \nInitial Theta = {b}")
result = minimize(cost, b, args=(x,y.flatten(),l), jac=gradient_descent)
b = result.x
print(f"Beta Values: {b}")
print(result)
prediction = accuracy(x, y, b)
for e, f in zip(prediction, y):
    if e == f:
        count += 1
accuracy_score = count/m * 100
print(f"Accuracy = {accuracy_score}")