from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
我有X和Y数据。
# Load the iris dataset and pull out the feature matrix and the labels.
data = load_iris()
X, Y = data.data, data.target
我想用k折交叉验证的方法来实现RFECV特征选择和预测。
# Base classifier used as the last pipeline step.
clf = RandomForestClassifier()

# Two shuffled folds with a fixed seed so the splits are reproducible.
kf = KFold(n_splits=2, shuffle=True, random_state=0)

# Pipeline steps: standardize the features, then classify.
estimators = [
    ('standardize', StandardScaler()),
    ('clf', clf),
]
class Mypipeline(Pipeline):
    """Pipeline that forwards the final step's importance attributes.

    RFECV needs the estimator it wraps to expose ``coef_`` or
    ``feature_importances_``; this subclass provides both by delegating
    to the last step of the pipeline.
    """

    @property
    def coef_(self):
        """Return ``coef_`` of the final estimator."""
        return self._final_estimator.coef_

    @property
    def feature_importances_(self):
        """Return ``feature_importances_`` of the final estimator."""
        return self._final_estimator.feature_importances_
# Wrap scaler + classifier so RFECV can read the importances it needs.
pipeline = Mypipeline(estimators)

# Recursive feature elimination, scored by accuracy on the folds in kf.
rfecv = RFECV(estimator=pipeline, cv=kf, scoring='accuracy', verbose=10)
rfecv_data = rfecv.fit(X, Y)
print('no. of selected features =', rfecv_data.n_features_)

# Keep only the columns RFECV selected.
X_new = rfecv.transform(X)
print(X_new.shape)

# Cross-validated predictions of the bare classifier on the reduced data.
y_predicts = cross_val_predict(clf, X_new, Y, cv=kf)
accuracy = accuracy_score(Y, y_predicts)
print('accuracy =', accuracy)
答案 0 :(得分:2)
与其将StandardScaler和RFECV包装在同一管道中,不如将StandardScaler和RandomForestClassifier包装成一个管道,然后将该管道作为估计器传递给RFECV。这样就不会泄漏任何训练信息。
# Scale and classify inside one pipeline, then hand that pipeline to RFECV.
estimators = [
    ('standardize', StandardScaler()),
    ('clf', RandomForestClassifier()),
]
pipeline = Pipeline(steps=estimators)
rfecv = RFECV(estimator=pipeline, scoring='accuracy')
rfecv_data = rfecv.fit(X, Y)
更新:关于错误'RuntimeError: The classifier does not expose "coef_" or "feature_importances_" attributes'
是的,这是scikit-learn管道(Pipeline)中的一个已知问题。您可以查看另一个回答以了解更多详细信息,并使用我在那里创建的新管道。
定义这样的自定义管道:
class Mypipeline(Pipeline):
    """Pipeline subclass exposing the last step's importance attributes.

    Lets RFECV read ``coef_`` or ``feature_importances_`` from the
    pipeline by delegating both to the final estimator.
    """

    @property
    def coef_(self):
        """Return ``coef_`` of the final estimator."""
        return self._final_estimator.coef_

    @property
    def feature_importances_(self):
        """Return ``feature_importances_`` of the final estimator."""
        return self._final_estimator.feature_importances_
并使用它:
# Same RFECV setup, now with the custom pipeline as the estimator.
pipeline = Mypipeline(steps=estimators)
rfecv = RFECV(pipeline, scoring='accuracy')
rfecv_data = rfecv.fit(X, Y)
@brute,对于您的数据和代码,算法可在一分钟之内在我的PC上完成。这是我使用的完整代码:
import numpy as np
import glob
from sklearn.utils import resample
files = glob.glob('/home/Downloads/Untitled Folder/*')

# Per file: parse the '|'-delimited rows, drop every row containing a NaN,
# then draw 1800 rows without replacement using a fixed seed.
outs = []
for fi in files:
    data = np.genfromtxt(fi, delimiter='|', dtype=float)
    data = data[~np.isnan(data).any(axis=1)]
    data = resample(data, replace=False, n_samples=1800, random_state=0)
    outs.append(data)

# Stack the per-file samples into one feature matrix.
X = np.vstack(outs)
print(X.shape)

# Labels 1..10, each repeated 1800 times. This hard-codes 10 classes, so it
# presumably expects exactly 10 input files -- verify against the data.
Y = np.repeat([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 1800)
print(Y.shape)
#from sklearn.utils import shuffle
#X, Y = shuffle(X, Y, random_state=0)
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
clf = RandomForestClassifier()

# Ten shuffled folds with a fixed seed.
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# Pipeline steps: standardize, then a fresh random forest.
estimators = [
    ('standardize', StandardScaler()),
    ('clf', RandomForestClassifier()),
]
class Mypipeline(Pipeline):
    """Pipeline whose importance attributes come from its final step.

    Delegates ``coef_`` and ``feature_importances_`` to the last
    estimator so that RFECV can use the pipeline as its estimator.
    """

    @property
    def coef_(self):
        """Return ``coef_`` of the final estimator."""
        return self._final_estimator.coef_

    @property
    def feature_importances_(self):
        """Return ``feature_importances_`` of the final estimator."""
        return self._final_estimator.feature_importances_
pipeline = Mypipeline(estimators)
rfecv = RFECV(estimator=pipeline, scoring='accuracy', verbose=10)
rfecv_data = rfecv.fit(X, Y)
print('no. of selected features =', rfecv_data.n_features_)

# Keep only the columns RFECV selected.
X_new = rfecv.transform(X)
print(X_new.shape)

# Evaluate with the pipeline rather than the bare classifier: RFECV chose
# the features on scaled data, and passing clf alone would skip the scaling.
y_predicts = cross_val_predict(pipeline, X_new, Y, cv=kf)
accuracy = accuracy_score(Y, y_predicts)
print('accuracy =', accuracy)
答案 1 :(得分:-1)
这是我们的处理方式:
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
data = load_iris()
# Features and labels are unpacked in a single tuple assignment.
X, Y = data.data, data.target

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, shuffle=True)

# create model
clf = RandomForestClassifier()

# instantiate K-Fold
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# pipeline estimators: standardize, then RFECV around the classifier
estimators = [
    ('standardize', StandardScaler()),
    ('rfecv', RFECV(estimator=clf, cv=kf, scoring='accuracy')),
]

# instantiate pipeline
pipeline = Pipeline(estimators)

# fit the pipeline (scaler + rfecv) on the training split
rfecv_model = pipeline.fit(X_train, y_train)

# print number of selected features
print('no. of selected features =', pipeline.named_steps['rfecv'].n_features_)

# print feature ranking
print('ranking =', pipeline.named_steps['rfecv'].ranking_)
'Output':
no. of selected features = 3
ranking = [1 2 1 1]
# Predict labels for the held-out test split with the fitted pipeline.
predictions = rfecv_model.predict(X_test)

# Report the accuracy of those predictions against the true test labels.
print("Accuracy on test set: ", accuracy_score(y_true=y_test, y_pred=predictions))
'Output':
Accuracy on test set:  0.9736842105263158