How to get p-values for a logistic regression

Asked: 2019-04-30 23:59:44

Tags: scikit-learn, p-value

I am trying to optimise a logistic regression by removing variables that are not significant at the 5% level (i.e. whose p-values are above 0.05).

I would like to know whether I can use OLS for this, or only the Logit fit method.
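For reference, both estimators in statsmodels expose per-coefficient p-values after fitting; a minimal sketch, assuming the X_train and y_train built in the code further below:

# Both results expose a .pvalues Series after fit(); Logit matches a binary
# target, while OLS on a 0/1 target is a linear probability model.
ols_result = sm.OLS(y_train.astype(float), X_train.astype(float)).fit()
logit_result = sm.Logit(y_train, X_train).fit(disp=0)
print(ols_result.pvalues.head())
print(logit_result.pvalues.head())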

When I use the Logit fit method and increase the number of features beyond a certain point, I get the error "LinAlgError: Singular matrix". When I remove some of those features, the Logit fit method works again.

Is there a limit on the number of features that can be passed to logit.fit, or is the error caused by something else?
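For context: this error usually means the design matrix is rank-deficient (some columns are exact linear combinations of others, which easily happens when many overlapping dummy variables are concatenated), not that Logit has a hard limit on the number of features. A minimal sketch of how one might check this, assuming the X_train built in the code below:

import numpy as np

# If the rank is lower than the number of columns, some columns are
# linearly dependent and Logit's Hessian cannot be inverted.
arr = np.asarray(X_train, dtype=float)
print("columns:", arr.shape[1], "rank:", np.linalg.matrix_rank(arr))

# Constant (all-zero or all-one) dummy columns are a common culprit.
constant_cols = [c for c in X_train.columns if X_train[c].nunique() <= 1]
print("constant columns:", constant_cols)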

My code is below:

# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
import pyodbc

#Setup SQL connection

conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=xxxxxx;'
                      'Database=Datawarehouse;'
                      'Trusted_Connection=yes;')
cursor = conn.cursor()
dataset = pd.read_sql('SELECT * from [VIEW_ContactsLeadsModel_AU]', con=conn)
conn.close()

#Create a copy of the data frame

ContactsOutput = dataset.copy()

#Fill Null Values

#dataset.isnull().any() 

dataset['Email'] = dataset['Email'].fillna(value = 'UEmail')
dataset['WorkplaceFunction'] = dataset['WorkplaceFunction'].fillna(value = 'UWorkplace')
dataset['IndustryLevel1'] = dataset['IndustryLevel1'].fillna(value = 'UIND1')
dataset['IndustryLevel2'] = dataset['IndustryLevel2'].fillna(value = 'UIND2')
dataset['IndustryLevel3'] = dataset['IndustryLevel3'].fillna(value = 'UIND3')
dataset['Title'] = dataset['Title'].fillna(value = 'UTitle')
dataset['LeadRating'] = dataset['LeadRating'].fillna(value = 'URating')


#Drop Variables not needed

dataset.drop(['ContactIDno','Email','MobilePhone','Phone','ContactName','Title','MailingCountry','LeadRating','LeadBuid','LeadDiv','LeadDPC','LeadSBU'],axis=1,inplace=True)

#Create Dummy variables

ContactDummy = pd.get_dummies(dataset['Contact_status__c'],drop_first=True)
WFunctionDummy = pd.get_dummies(dataset['WorkplaceFunction'],drop_first=True)
Industry1Dummy = pd.get_dummies(dataset['IndustryLevel1'],drop_first=True)
Industry2Dummy = pd.get_dummies(dataset['IndustryLevel2'],drop_first=True)
Industry3Dummy = pd.get_dummies(dataset['IndustryLevel3'],drop_first=True)
StateDummy = pd.get_dummies(dataset['MailingState'],drop_first=True)
LeadSourceDummy = pd.get_dummies(dataset['LeadSource'],drop_first=True)

#Drop original variables that have been dummied

dataset.drop(['Contact_status__c','WorkplaceFunction','IndustryLevel1','IndustryLevel2','IndustryLevel3'],axis=1,inplace=True)
dataset.drop(['MailingState','LeadSource','MailingCity'],axis=1,inplace=True)

#Keep only the Target and MobilePhoneFlag columns; the dummies are concatenated back below

dataset = dataset[['Target','MobilePhoneFlag']]

#Add new dummies to data
#dataset = pd.concat([dataset,ContactDummy,WFunctionDummy,Industry1Dummy,LeadSourceDummy,StateDummy],axis=1)  #causes error LinAlgError: Singular matrix
dataset = pd.concat([dataset,ContactDummy,WFunctionDummy,Industry1Dummy,LeadSourceDummy],axis=1)  #does not cause error LinAlgError: Singular matrix

#Create a view of your dataset

#Delete Dummy variables after moving back to DF

del ContactDummy
del WFunctionDummy
del Industry1Dummy
del Industry2Dummy
del Industry3Dummy
del StateDummy
del LeadSourceDummy

# Creating the dataset

X = dataset.drop('Target',axis = 1)
y = dataset['Target']

# Splitting the dataset into the Training set and Test set
# (train_test_split is already imported from sklearn.model_selection above;
#  the old sklearn.cross_validation module has been removed from scikit-learn)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Fit the logit model; note that no intercept column has been added
# (statsmodels does not add a constant automatically - see sm.add_constant)
logit_model=sm.Logit(y_train,X_train)
result=logit_model.fit()
print(result.summary2())
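For the stated goal of dropping variables that are not significant at the 5% level, a rough sketch of backward elimination driven by the fitted model's p-values follows; the 0.05 threshold and the loop are illustrative only, and the same singular-matrix error can still occur if collinear columns remain:

# Repeatedly drop the predictor with the largest p-value until every
# remaining p-value is below 0.05 (illustrative, not a definitive recipe).
cols = list(X_train.columns)
while cols:
    sel_result = sm.Logit(y_train, X_train[cols]).fit(disp=0)
    worst = sel_result.pvalues.idxmax()
    if sel_result.pvalues[worst] < 0.05:
        break
    cols.remove(worst)
print(sel_result.summary2())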
