I am doing text classification. I have about 32K (spam and ham) documents.
import numpy as np
import pandas as pd
import sklearn.datasets as dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import SGDClassifier
from bs4 import BeautifulSoup
from sklearn.feature_extraction import text
from sklearn import model_selection
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
# Now load files from spam and ham
data = dataset.load_files("/home/voila/Downloads/enron1/")
xData = data.data
yData = data.target
print(data.target_names)
countVector = CountVectorizer(decode_error='ignore', stop_words='english')
countmatrix = countVector.fit_transform(xData)
countmatrix is now a (sparse) document-term matrix: countmatrix[i, j] is the count of word j in document i.
Now I want to remove every feature (column) for which countmatrix[i, j] > 1 in more than 80% of the documents (meaning the word is too common).
How can I do that?
Thanks
Answer 0 (score: 2)
You can set max_df to a value smaller than 1; see the docs.
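A minimal sketch of that suggestion, reusing xData and the imports from the question (max_df=0.8 is an assumed threshold matching the 80% figure):

# max_df=0.8 makes CountVectorizer ignore terms that appear in more
# than 80% of the documents. Note it filters on document frequency
# (the word merely being present), not the question's "count > 1"
# criterion, so it is slightly stricter.
countVector = CountVectorizer(decode_error='ignore', stop_words='english',
                              max_df=0.8)
countmatrix = countVector.fit_transform(xData)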
Answer 1 (score: 1)
Try this:
goodwords = np.asarray((countmatrix > 1).mean(axis=0) <= 0.8).ravel().nonzero()[0]
It first computes a boolean matrix that is True wherever countmatrix > 1, then takes its column-wise mean, i.e. the fraction of documents in which each word occurs more than once. nonzero() returns the column indices where that fraction is at most 0.8 (80%). (The np.asarray(...).ravel() is needed because .mean(axis=0) on a scipy sparse matrix returns a 1xN np.matrix, whose nonzero() would otherwise yield row indices.)
So goodwords contains the indices of all words that are not too frequent. You can then keep just those columns with:
countmatrix = countmatrix[:, goodwords]
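As a quick sanity check you can map the kept column indices back to words (assuming a recent scikit-learn, where the fitted vectorizer exposes get_feature_names_out; goodwords indexes the original vocabulary, so fetch the names before relying on the sliced matrix):

# Map surviving column indices back to the actual words.
feature_names = countVector.get_feature_names_out()
print(countmatrix.shape)              # (n_documents, n_kept_words)
print(feature_names[goodwords][:10])  # a few of the kept words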
Answer 2 (score: 0)
I think you can go through the matrix column by column:
def remove_feature(countmatrix, threshold=0.8):
    # Drop columns where the word occurs more than once in over
    # `threshold` (here 80%) of the documents.
    dense = countmatrix.toarray()  # np.delete needs a dense array
    remove_index = []
    for index in range(dense.shape[1]):
        if (dense[:, index] > 1).mean() > threshold:
            remove_index.append(index)
    return np.delete(dense, remove_index, 1)
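A hypothetical usage sketch; note that toarray() densifies the matrix, which can be memory-hungry for 32K documents:

reduced = remove_feature(countmatrix)
print(reduced.shape)  # fewer columns than countmatrix

For a corpus this size, the vectorized approach from Answer 1, which keeps the matrix sparse, is the better fit.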