我有一个包含大量特征的 CountVectorizer(计数向量化器),我希望能够从转换后的数据集中选出 k 个最佳特征,然后更新 count_vectorizer,使其仅包含这些特征。这可行吗?
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats as ss
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
# Matches merge tags of the form "*|...|*". The quantifier is non-greedy so
# two tags on the same line are matched separately. A raw string is used
# because "\*" and "\|" are invalid escape sequences in a normal literal.
merge = re.compile(r'\*\|.+?\|\*')


def stripmerge(sub):
    """Replace each ``*|...|*`` merge tag in *sub* with an alphanumeric token.

    For every tag found, the opening ``*|`` becomes ``mcopen``, the closing
    ``|*`` becomes ``mcclose``, and any remaining non-alphanumeric ASCII
    characters inside the tag are dropped. For example ``*|FIRST NAME|*``
    becomes ``mcopenFIRSTNAMEmcclose``.

    Args:
        sub: The string to clean.

    Returns:
        The string with all merge tags rewritten; text outside the tags is
        left unchanged.
    """
    for tag in merge.findall(sub):
        token = tag.replace('*|', 'mcopen').replace('|*', 'mcclose')
        # Strip everything that is not an ASCII letter or digit.
        token = re.sub(r'[^0-9a-zA-Z]', '', token)
        # str.replace rewrites every occurrence of this exact tag text.
        sub = sub.replace(tag, token)
    return sub
# NOTE: the name ``input`` shadows the builtin of the same name; it is kept
# because a later statement in this script reads
# ``input.unique_open_performance``.
input = pd.read_csv('subject_tool_test_23.csv')

# Replace missing subjects with a single space. The filled column is assigned
# back to the frame instead of using chained assignment
# (``input.subject[mask] = ' '``), which can end up writing to a temporary
# object and leave the frame unchanged.
input['subject'] = input['subject'].fillna(' ')

# Rewrite merge tags in every subject line before vectorizing.
subjects = np.asarray([stripmerge(i) for i in input.subject])

# Binary unigram bag-of-words, English stop words removed, capped at 500 terms.
count_vectorizer = CountVectorizer(
    strip_accents='unicode',
    ngram_range=(1, 1),
    binary=True,
    stop_words='english',
    max_features=500,
)
counts = count_vectorizer.fit_transform(subjects)
#see the first output example here
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Boolean target: True for rows whose ``unique_open_performance`` is positive.
good = np.asarray(input['unique_open_performance'] > 0)

# Keep the 250 columns of ``counts`` with the highest chi-squared score
# against the target.
selector = SelectKBest(score_func=chi2, k=250)
count_new = selector.fit_transform(counts, good)
第一个输出示例:特征与原始文本一致,是有意义的。
>>> counts[1]
<1x500 sparse matrix of type '<type 'numpy.int64'>'
with 3 stored elements in Compressed Sparse Row format>
>>> subjects[1]
"Lake Group Media's Thursday Target"
>>> count_vectorizer.inverse_transform(counts[1])
[array([u'group', u'media', u'thursday'],
dtype='<U18')]
第二个输出示例:特征与原始文本不再匹配。
>>> count_new = SelectKBest(chi2, k=250).fit_transform(counts, good)
>>> count_new.shape
(992979, 250)
>>> count_new[1]
<1x250 sparse matrix of type '<type 'numpy.int64'>'
with 2 stored elements in Compressed Sparse Row format>
>>> count_vectorizer.inverse_transform(count_new[1])
[array([u'independence', u'easy'],
dtype='<U18')]
>>> subjects[1]
"Lake Group Media's Thursday Target"
有没有办法将特征选择的结果应用到我的计数向量化器上,以便生成只包含重要特征的新向量?
答案 0 :(得分:3)
我解决这个问题的方法是:先运行特征选择,确定原始集合中哪些列被选中,用这些列对应的词条创建词汇表,然后运行一个仅限于该词汇表的新计数向量化器。对于大型数据集,这需要更长时间,但确实可行。
# Fit the selector once and keep it, so the support mask can be read back.
ch2 = SelectKBest(chi2, k=3000)
count_new = ch2.fit_transform(counts, good)

# ``get_feature_names`` was replaced by ``get_feature_names_out`` in newer
# scikit-learn releases; fall back to the old name on older installations.
try:
    feature_names = count_vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = count_vectorizer.get_feature_names()

# Terms whose columns survived selection. The original snippet called this
# variable ``dict``, which shadowed the builtin; a descriptive name is used
# instead.
selected_terms = np.asarray(feature_names)[ch2.get_support()]

# A new vectorizer restricted to the selected terms; it produces only those
# columns when it transforms text.
count_vectorizer = CountVectorizer(
    strip_accents='unicode',
    ngram_range=(1, 1),
    binary=True,
    vocabulary=selected_terms,
)
答案 1 :(得分:1)
我相信这就是你要找的东西。它是一个修改过的 SelectKBest 对象,可以转换词汇表对象(词条到索引的字典)或 CountVectorizer 对象,并更新其词汇表。无需重新提取所有特征。
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
class CustomSelectKBest(SelectKBest):
    """SelectKBest that can also remap a CountVectorizer vocabulary.

    After fitting, the selector knows which input columns were kept. The
    extra methods here use that information to rewrite a ``term -> column``
    vocabulary so it matches the reduced matrix, without re-extracting the
    features.
    """

    def __init__(self, score_func=f_classif, k=10):
        """Create the selector.

        Args:
            score_func: Scoring callable forwarded to ``SelectKBest``.
            k: Number of top-scoring features to keep.
        """
        # ``k`` is passed by keyword: newer scikit-learn releases declare it
        # keyword-only, and the keyword form also works on older releases.
        super(CustomSelectKBest, self).__init__(score_func, k=k)

    def transform_vocabulary(self, vocabulary):
        """Return *vocabulary* restricted to, and re-indexed for, kept columns.

        Args:
            vocabulary: Mapping of term to column index in the original
                (unreduced) feature matrix.

        Returns:
            A new dict mapping each surviving term to its column index in
            the reduced matrix. Terms whose columns were dropped are omitted.
        """
        # Indices of the kept columns, in the order they appear in the output.
        kept_columns = self.get_support(True)
        new_index = {old: new for new, old in enumerate(kept_columns)}
        # ``items()`` works on both Python 2 and Python 3 (``iteritems`` is
        # Python 2 only).
        return {
            term: new_index[old]
            for term, old in vocabulary.items()
            if old in new_index
        }

    def transform_vectorizer(self, cv):
        """Replace ``cv.vocabulary_`` in place with the remapped vocabulary.

        Args:
            cv: Object exposing a ``vocabulary_`` mapping (e.g. a fitted
                CountVectorizer).
        """
        cv.vocabulary_ = self.transform_vocabulary(cv.vocabulary_)
if __name__ == '__main__':
    # Self-test: the remapped vocabulary must point at the same column data
    # in the reduced matrix as the original vocabulary did in the full one.

    def score_func(X, y):
        # Fake scores and p-values: the score equals the column index, so the
        # highest-numbered columns are the ones selected.
        return (np.arange(X.shape[1]), np.zeros(X.shape[1]))

    # Create test data.
    size = (4, 10)
    X = np.random.randint(0, 5, size=size)
    y = np.random.randint(2, size=size[0])
    # One single-letter term per column: 'a' -> 0, 'b' -> 1, ...
    vocabulary = {chr(i + ord('a')): i for i in range(size[1])}

    skb = CustomSelectKBest(score_func=score_func, k=5)
    X_s = skb.fit_transform(X, y)
    vocab_s = skb.transform_vocabulary(vocabulary)

    # Confirm they have the right values.
    # ``items()`` and ``print(...)`` run on both Python 2 and Python 3.
    for k, i_s in vocab_s.items():
        i = vocabulary[k]
        assert (X_s[:, i_s].T == X[:, i].T).all()
    print('Test passed')
答案 2 :(得分:1)
使用 Pipeline 会让事情简单得多。Pipeline 会自动对测试数据应用相同的转换,您不必手动重新创建向量化器。
# These names are used below but were never imported in the original snippet.
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Vectorize -> keep the 3000 best chi-squared features -> classify. The
# pipeline applies the fitted vectorizer and selector to new data itself.
text_clf_red = Pipeline([
    ('vect', CountVectorizer()),
    ('reducer', SelectKBest(chi2, k=3000)),
    ('clf', MultinomialNB()),
])

# NOTE(review): X_train, y_train, X_test and y_test are not defined in this
# snippet; they are assumed to be supplied by the surrounding code.
text_clf_red.fit(X_train, y_train)
y_test_pred = text_clf_red.predict(X_test)
metrics.accuracy_score(y_test, y_test_pred)