我有问题的代码是Scikit-Learn对训练集的统计分析。这是简单线性回归的代码,它使用Scikit Learn重现Statsmodels的输出,但是当我尝试对其进行更改以执行它时在多元线性回归中,我失败了。您有任何想法如何对其进行更改以使其正常工作吗?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
data = pd.read_excel ("C:\\Users\\Aymen\\Desktop\\Multiple_Linear_Regression\\Excels\\ABC.xlsx",'Sheet1') #Import Excel file
# Replace null values of the whole dataset with 0
data1 = data.fillna(0)
# Extraction of the independent and dependent variables
X = data1.iloc[0:len(data1),[1,2,3,4,5]] #Extract the column of the COPCOR SP we are going to check its impact
Y = data1.iloc[0:len(data1),6] #Extract the column of the PAUS SP
XY = data1.iloc[0:len(data1),[1,2,3,4,5,6]] #Extract the column of the COPCOR SP we are going to check its impact
# Data Splitting to train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size =0.25,random_state=42)
from sklearn.preprocessing import StandardScaler
scaled_features = StandardScaler().fit_transform(X_train)
X_train = pd.DataFrame(scaled_features, index=X_train.index, columns=X_train.columns)
scaled_features = StandardScaler().fit_transform(X_test)
X_test = pd.DataFrame(scaled_features, index=X_test.index, columns=X_test.columns)
# Statistical Analysis of the training set with Statsmodels
X_All = sm.add_constant(X_train) # add a constant to the model
est = sm.OLS(Y_train, X_All).fit()
print(est.summary()) # print the results
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math
lm = LinearRegression() # create an lm object of LinearRegression Class
lm.fit(X_train,Y_train) # train our LinearRegression model using the training set of data - dependent and independent variables as parameters. Teaching lm that Y_train values are all corresponding to X_train.
mse_test = mean_squared_error(Y_test, lm.predict(X_test))
# Statistical Analysis of the training set with Scikit-Learn
params1 = np.append(lm.intercept_,lm.coef_)
predictions1 = lm.predict(X_train)
newX1 = pd.DataFrame({"Constant":np.ones(len(X_train))}).join(pd.DataFrame(X_train))
MSE1 = (sum((Y_train-predictions1)**2))/(len(newX1)-len(newX1.columns))
var_b1 = MSE1*(np.linalg.inv(np.dot(newX1.T,newX1)).diagonal())
sd_b1 = np.sqrt(var_b1)
ts_b1 = params1/ sd_b1
p_values1 =[2*(1-stats.t.cdf(np.abs(i),(len(newX1)-1))) for i in ts_b1]
sd_b1 = np.round(sd_b1,3)
ts_b1 = np.round(ts_b1,3)
p_values1 = np.round(p_values1,5)
params1 = np.round(params1,4)
myDF2 = pd.DataFrame()
myDF2["Coefficients"],myDF2["Standard Errors"],myDF2["t values"],myDF2["P-values"] = [params1,sd_b1,ts_b1,p_values1]
# Data Splitting to train and test set of the reduced data
X_1 = data1.iloc[0:len(data1),[2,3]] #Extract the column of the COPCOR SP we are going to check its impact
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X_1, Y, test_size =0.25,random_state=42)
scaled_features = StandardScaler().fit_transform(X_train2)
X_train2 = pd.DataFrame(scaled_features, index=X_train2.index, columns=X_train2.columns)
scaled_features = StandardScaler().fit_transform(X_test2)
X_test2 = pd.DataFrame(scaled_features, index=X_test2.index, columns=X_test2.columns)
# Statistical Analysis of the reduced model with Statsmodels
X_reduced = sm.add_constant(X_train2) # add a constant to the model
est_reduced = sm.OLS(Y_train2, X_reduced).fit()
print(est_reduced.summary()) # print the results
# Fitting a Linear Model for the reduced model with Scikit-Learn
lm1 = LinearRegression() #create an lm object of LinearRegression Class
lm1.fit(X_train2, Y_train2)
mse_test1 = mean_squared_error(Y_test2, lm1.predict(X_test2))
#Cross Validation and Training again the model
from sklearn.model_selection import KFold
from sklearn import model_selection
kf = KFold(n_splits=6, random_state=42)
for train_index, test_index in kf.split(X_train):
print("Train:", train_index, "Validation:",test_index)
X_train1, X_test1 = X.iloc[train_index], X.iloc[test_index]
Y_train1, Y_test1 = Y.iloc[train_index], Y.iloc[test_index]
results = -1 * model_selection.cross_val_score(lm1, X_train1, Y_train1,scoring='neg_mean_squared_error', cv=kf)
#RMSE values interpretation
#Good model built no overfitting or underfitting (Barely Same for test and training5/6 : Goal of Cross validation but low prediction accuracy = Value is big
import seaborn
seaborn.heatmap(Corr,cmap='RdYlGn_r',vmax=1.0,vmin=-1.0,mask=mask, linewidths=2.5)
Last 26 weeks Variable Number 1
0 201823 0
1 201824 0
2 201825 0
3 201826 0
4 201827 0
5 201828 105000
6 201829 0
7 201830 -105000
8 201831 0
9 201832 0
10 201833 0
11 201834 0
12 201835 0
13 201836 0
14 201837 0
15 201838 0
16 201839 0
17 201840 0
18 201841 0
19 201842 0
20 201843 0
21 201844 0
22 201845 0
23 201846 0
24 201847 0
25 201848 0
Variable Number 2
0 0
1 0
2 0
3 0
4 543000
5 0
6 0
7 0
8 0
9 0
10 0
11 120000
12 0
13 -3000
14 0
15 0
16 -75000
17 -36000
18 228000
19 0
20 0
21 0
22 0
23 630000
24 0
25 -132000
Variable Number 3 \
0 0
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 0
18 0
19 345000
20 0
21 0
22 0
23 0
24 0
25 0
Variable Number 4 Variable Number 5\
0 714000 0
1 0 57000
2 0 0
3 0 0
4 0 0
5 0 6000
6 0 0
7 0 0
8 0 0
9 0 0
10 0 0
11 0 0
12 0 3000
13 0 0
14 0 0
15 0 24000
16 0 0
17 0 0
18 0 0
19 0 0
20 0 3000
21 0 0
22 0 0
23 0 0
24 0 138000
25 0 48000
Variable Number 6
0 765000
1 57000
2 0
3 0
4 615000
5 111000
6 0
7 0
8 0
9 0
10 0
11 237000
12 165000
13 0
14 0
15 24000
16 0
17 0
18 357000
19 429000
20 3000
21 21000
22 0
23 630000
24 138000
25 48000