我正在尝试减少包含 7,000 多个不同变量的大型数据集,以便对数据运行一些机器学习。我认为最好的方法是运行 MLR,找到 p 值,然后从那里缩短它。如果有人对如何减少有更好的想法,我将不胜感激。
因为许多变量对于 R 和 Stata 来说太大了,所以我只使用了 python 并且一直在使用 Sklearn 而没有查看摘要,有没有人知道潜在的解决方法?
import numpy as np
import pandas as pd
# import sklearn.model_selection import train_test_split
# from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
cadet_data = pd.read_csv('CE With Comma_V1.csv')
# print(cadet_data.iloc[:,7424].values)
# print(cadet_data.iloc[:,:-1].values)
x = cadet_data.iloc[:,:-1].values
y = cadet_data.iloc[:,7424].values
x = np.squeeze(np.asarray(x))
y = np.squeeze(np.asarray(y))
#
# print(x)
# print(y)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=0)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,Y_train)
y_pred= regressor.predict(X_test)
test_set_r2 = r2_score(Y_test, y_pred)
regressor_OLS=sm.OLS(endog=x,exog=y).fit()
regressor_OLS.summary()
这给了我以下错误:
Traceback (most recent call last):
File "C:\Users\......", ", line 34, in <module>
regressor_OLS.summary()
File "C:\Users\......", ", line 2640, in summary
["%#8.3f" % self.rsquared]),
File "pandas\_libs\properties.pyx", line 33, in pandas._libs.properties.CachedProperty.__get__
File "C:\Users\......", ", line 1717, in rsquared
return 1 - self.ssr/self.uncentered_tss
File "pandas\_libs\properties.pyx", line 33, in pandas._libs.properties.CachedProperty.__get__
File "C:\Users\......", line 1656, in ssr
return np.dot(wresid, wresid)
File "<__array_function__ internals>", line 5, in dot
ValueError: shapes (332,7424) and (332,7424) not aligned: 7424 (dim 1) != 332 (dim 0)
Process finished with exit code 1