我正在加载用于分类器培训的预先分类的文件。我对它们进行了修改,然后使用tfidf计算了重量,并使用了chi2 selelction特征方法。但durinig multinomialNB训练出现错误。代码是
from sklearn.datasets import load_files
train_files=load_files('c:\\python34\\nltk_data\\corpora\\crimettest train',shuffle=True)
#vector data to frequncy vales
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_files.data)
#vector data to TF vales
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
from sklearn.feature_selection import chi2
X_new = chi2(X_train_tfidf,train_files.target)
#train classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_new,train_files.target)
然后我修改了我的代码,改变了最后一行:
X= SelectKBest(chi2,k=20)
X_new=X.fit_transform(X_train_tfidf,train_files.target)
#train classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_new,train_files.target)
#predict new documents
path='c:\\python34\\nltk_data\\corpora\\crime\\class'
corpus = PlaintextCorpusReader (path,'.*\.txt')
docs_new=[]
files=corpus.fileids()
for i in files :
f=(open(path+'\\'+i , 'r')).read()
docs_new.append(f)
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
for doc,category in zip(files, predicted):
print('predicted',doc,train_files.target_names[category])
任何人都可以帮助我吗?