我为 FeatureUnion 定义了一个自定义的转换器类。Python 2.7 报错:"AttributeError: 'module' object has no attribute 'TextTransformer'"。同样的代码可以在 Kaggle 平台上运行,但在我本地的 IPython 中无法运行。
from sklearn.base import BaseEstimator, TransformerMixin
class TextTransformer(BaseEstimator, TransformerMixin):
    """Pick one entry out of the input and return its values as strings.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    only looks up ``self.key`` in the object it is given.
    """

    def __init__(self, key):
        """Remember which key ``transform`` should look up.

        :param key: value used to index the object passed to ``transform``.
        """
        self.key = key

    def fit(self, x, y=None):
        """Do nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]`` with ``str`` applied to each value.

        :param data_dict: indexable container; the selected entry must offer
            an ``apply`` method (presumably a pandas DataFrame whose columns
            are Series -- confirm against the caller).
        """
        selected = data_dict[self.key]
        return selected.apply(str)
def _text_branch(index, column):
    """Build the FeatureUnion branch for one text column.

    Every call creates its own TfidfVectorizer and TruncatedSVD, so the
    branches never share a (fitted) vectorizer or SVD object.  Step names
    follow the ``s<N>`` / ``tfidf<N>`` / ``tsvd<N>`` scheme, so nested
    parameter paths such as ``union__txt1__tfidf1__...`` stay valid.

    :param index: 1-based branch number used as suffix of the step names.
    :param column: key handed to TextTransformer to select the text column.
    :return: a Pipeline of column selector -> TF-IDF -> 10-component SVD.
    """
    return pipeline.Pipeline([
        ('s%d' % index, TextTransformer(key=column)),
        ('tfidf%d' % index, TfidfVectorizer()),
        ('tsvd%d' % index, TruncatedSVD(n_components=10)),
    ])


rfr = RandomForestRegressor()

# Four text branches are computed side by side, weighted, concatenated by the
# FeatureUnion and fed to the random forest.
clf = pipeline.Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('txt1', _text_branch(1, 'search_term')),
            ('txt2', _text_branch(2, 'product_title')),
            ('txt3', _text_branch(3, 'product_description')),
            ('txt4', _text_branch(4, 'brand')),
        ],
        transformer_weights={
            'txt1': 0.5,
            'txt2': 0.25,
            'txt3': 0.25,
            'txt4': 0.5,
        },
        n_jobs=-1,
    )),
    ('rfr', rfr),
])

param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
model = grid_search.GridSearchCV(
    estimator=clf, param_grid=param_grid, n_jobs=-1, cv=10)

# n_jobs=-1 asks for parallel workers.  Where workers are started by
# re-importing the main module (e.g. Windows), an unguarded fit would be
# executed again inside every worker, so only run it as the entry point.
# NOTE(review): the workers must also be able to import TextTransformer.  A
# class typed into an interactive IPython session cannot be found by them
# ("'module' object has no attribute 'TextTransformer'"); in that case move
# the class into its own importable module or use n_jobs=1 -- confirm on the
# target platform.
if __name__ == '__main__':
    model.fit(X_train, y_train)
答案 0 :(得分:1)
你可能忘记了一些导入。试试这个,它对我有用。
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import *
from sklearn.feature_extraction.text import *
from sklearn.decomposition import *
from sklearn.pipeline import *
from sklearn.grid_search import *
class TextTransformer(BaseEstimator, TransformerMixin):
    """Select one entry of the input and return its values as strings.

    BaseEstimator is inherited in addition to TransformerMixin so that the
    transformer provides ``get_params`` / ``set_params``; scikit-learn relies
    on these when it clones estimators or sets nested parameters (both happen
    inside a grid search).  The transformer itself is stateless.
    """

    def __init__(self, key):
        """Store the lookup key.

        :param key: value used to index the object passed to ``transform``.
            It is kept under the same attribute name, as BaseEstimator's
            parameter introspection expects.
        """
        self.key = key

    def fit(self, x, y=None):
        """Do nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]`` with ``str`` applied to each value.

        :param data_dict: indexable container; the selected entry must offer
            an ``apply`` method (presumably a pandas DataFrame whose columns
            are Series -- confirm against the caller).
        """
        return data_dict[self.key].apply(str)
def _text_branch(index, column):
    """Build the FeatureUnion branch for one text column.

    Every call creates its own TfidfVectorizer and TruncatedSVD, so the
    branches never share a (fitted) vectorizer or SVD object.  Step names
    follow the ``s<N>`` / ``tfidf<N>`` / ``tsvd<N>`` scheme, so nested
    parameter paths such as ``union__txt1__tfidf1__...`` stay valid.

    :param index: 1-based branch number used as suffix of the step names.
    :param column: key handed to TextTransformer to select the text column.
    :return: a Pipeline of column selector -> TF-IDF -> 10-component SVD.
    """
    return Pipeline([
        ('s%d' % index, TextTransformer(key=column)),
        ('tfidf%d' % index, TfidfVectorizer()),
        ('tsvd%d' % index, TruncatedSVD(n_components=10)),
    ])


rfr = RandomForestRegressor()

# Four text branches are computed side by side, weighted, concatenated by the
# FeatureUnion and fed to the random forest.
clf = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('txt1', _text_branch(1, 'search_term')),
            ('txt2', _text_branch(2, 'product_title')),
            ('txt3', _text_branch(3, 'product_description')),
            ('txt4', _text_branch(4, 'brand')),
        ],
        transformer_weights={
            'txt1': 0.5,
            'txt2': 0.25,
            'txt3': 0.25,
            'txt4': 0.5,
        },
        n_jobs=-1,
    )),
    ('rfr', rfr),
])

param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
model = GridSearchCV(
    estimator=clf, param_grid=param_grid, n_jobs=-1, cv=10)

# n_jobs=-1 asks for parallel workers.  Where workers are started by
# re-importing the main module (e.g. Windows), an unguarded fit would be
# executed again inside every worker, so only run it as the entry point.
# NOTE(review): the workers must also be able to import TextTransformer.  A
# class typed into an interactive IPython session cannot be found by them
# ("'module' object has no attribute 'TextTransformer'"); in that case move
# the class into its own importable module or use n_jobs=1 -- confirm on the
# target platform.
if __name__ == '__main__':
    model.fit(X_train, y_train)