在 scikit-learn 中,当特征已经过编码时,是否仍然可以(使用随机森林)计算各个特征的重要性?
答案 0 :(得分:2)
以下示例展示了如何将特征名称与其对应的重要性配对:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
# Toy dataset: a single categorical column named 'feature' and a binary
# target with one label per row.
X = pd.DataFrame(
    ['value1', 'value2', 'value2', 'value1', 'value2'],
    columns=['feature'],
)
y = [1, 0, 0, 1, 1]
# translate rows to dicts
def row_to_dict(X, y=None):
    """Convert every row of a DataFrame into a ``{column: value}`` dict.

    Args:
        X: DataFrame whose rows are to be converted.
        y: Unused; accepted so the function can be called with an optional
            target argument.

    Returns:
        The result of ``X.apply(dict, axis=1)``: one dict per row of ``X``.
    """
    # axis=1 hands each row (not each column) to ``dict``.
    return X.apply(dict, axis=1)
# define prediction model
# Three steps: rows -> dicts, dicts -> vectorized feature matrix, random forest.
# NOTE(review): validate=False is kept from the original; presumably it lets
# the DataFrame reach row_to_dict unchanged -- confirm for the scikit-learn
# version in use.
ft = FunctionTransformer(row_to_dict, validate=False)
dv = DictVectorizer()
rf = RandomForestClassifier()
# glue steps together
model = make_pipeline(ft, dv, rf)
# train
model.fit(X, y)
# get feature importances
# Pair each name in dv.feature_names_ with the matching entry of
# rf.feature_importances_. Wrapped in list() because zip() is lazy on
# Python 3 and would otherwise print as an opaque zip object.
feature_importances = list(zip(dv.feature_names_, rf.feature_importances_))
# have a look
# print() as a function call works on both Python 2 and Python 3.
print(feature_importances)