我已经从句子中手动提取了一些功能,例如POS标签,headwords等,还使用CountVectorizer
创建了单词袋。我正在尝试进行嵌套交叉验证,我想创建带有各种参数的单词袋。为此,我使用了Pipeline
类,但遇到了错误。
我从CountVectorizer
类继承来并入了各种设置。但是,当我将词袋与其他功能(转换为csr_matrix
)结合使用时,会出现以下错误:X has 4058 features per sample; expecting 4766
。
class FeatureCombiner(CountVectorizer):
def __init__(self,analyzer='word',stop_words=None,ngram_range=(1, 1)):
CountVectorizer.__init__(self)
def fit_transform(self, x, y=None, **fit_params):
x_indices = x.index
vect = DictVectorizer()
temp_feats = np.take(features, x_indices, axis=0)
eng_features = vect.fit_transform(temp_feats)
bow = CountVectorizer.fit_transform(self, x, y, **fit_params)
final_features = scipy.sparse.hstack([eng_features, bow])
final_features = sklearn.preprocessing.normalize(final_features, axis=0)
return final_features
pipeline = Pipeline([
('vec', FeatureCombiner()),
# ('vec', CountVectorizer()),
('clf', LinearSVC(random_state=0))
])
parameters = {
'clf__C': [0.1, 1, 10],
'vec__stop_words': [None, 'english']
}
NUM_TRIALS = 1
for i in range(NUM_TRIALS):
inner_cv = KFold(n_splits=5, shuffle=True, random_state=i)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=i)
clf = GridSearchCV(pipeline, parameters, cv=inner_cv,
iid=False)
clf.fit(sentences, labels)
nested_score = cross_val_score(clf, X=sentences, y=labels, cv=outer_cv, scoring="f1")
print(clf.best_estimator_)