我编写了使用scikit-learn保存和恢复模型的代码。对于新数据的新测试,根据需要编写新内容。我已经关注了关于处理文本数据的scikit-learn文档。
# --- Train a Bengali text classifier and persist it to disk ---------------
filename = 'model_bengali.pkl'

# Standalone vectorizer / tf-idf transformer (fitted on the training corpus);
# decode_error='strict' makes bad UTF-8 raise instead of passing silently.
vectorizer = CountVectorizer(encoding='utf-8', decode_error='strict')
X_train = vectorizer.fit_transform(doc_train.data)
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(X_train)
clf = MultinomialNB().fit(x_train_tfidf, doc_train.target)

# Pipeline bundles vectorizer -> tf-idf -> classifier so a single pickle
# carries the whole preprocessing + model chain.
text_clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(doc_train.data, doc_train.target)

# Drop the (potentially large) stop_words_ attribute before pickling.
# NOTE(review): the object being pickled is the *pipeline*, whose own
# vectorizer is text_clf.named_steps['vectorizer'] — clearing the
# standalone `vectorizer` (as the original code did) would not shrink
# the pickle at all.
text_clf.named_steps['vectorizer'].stop_words_ = None

# pickle.dump(obj, file): first argument is the object to serialise
# (here the fitted pipeline), second the open binary file handle.
with open(filename, 'wb') as fout:
    pickle.dump(text_clf, fout)
'''
Read the new text to classify from stdin and store it verbatim in a file.
sys.stdin.read() returns ONE string containing everything up to EOF
(Ctrl-D / Ctrl-Z), so a multi-paragraph paste is kept together — unlike
readlines(), which would return a list of lines.
'''
with open('text_to_predict_bengali.txt', 'w+') as f:
    while True:
        b = input("Press any character to begin and copy text to be tested on next line ->")
        # An empty response ends the loop BEFORE blocking on stdin
        # (the original checked `b` only after sys.stdin.read() had
        # already consumed input up to EOF).
        if not b:
            break
        inp_str = sys.stdin.read()
        f.write(inp_str)
# The `with` block closes (and flushes) the file here, so the content is
# guaranteed to be on disk before it is reopened for reading below.
# --- Load the pickled pipeline and classify the new text ------------------
# Read the WHOLE file as one document.  transform()/predict() expect an
# iterable of documents, so wrapping the full text in a one-element list
# yields a single prediction for the entire paste — the original
# `f.split()` both raised AttributeError (file objects have no .split())
# and, once fixed to f.read().split(), would have produced one prediction
# per word.
with open('text_to_predict_bengali.txt', 'r') as f:
    x = [f.read()]

with open(filename, 'rb') as fin:
    text_clf = pickle.load(fin)

# The unpickled pipeline already contains the fitted vectorizer, tf-idf
# transformer and classifier, so there is no need to re-vectorise with the
# standalone objects or to re-train MultinomialNB as the original did.
X_new_pred = text_clf.predict(x)
print(X_new_pred)

# Append the predictions to the same file, one "document => class" line each.
with open('text_to_predict_bengali.txt', 'a+') as f:
    for doc, category in zip(x, X_new_pred):
        print('%r => %s' % (doc, doc_train.target_names[category]), file=f)

# 5-fold cross-validation score of the pipeline on the training corpus.
print(cross_val_score(text_clf, doc_train.data, doc_train.target, cv=5))
pickle.dump()的参数应该是什么。 当我读取新数据的文件内容时,每个单词都会转换为列表,这不是我想要获得的。 我想要的是,应该一次读取文件的多个段落,并且必须将整个内容转换为单个列表?
答案 0(得分:0)
试试这个
get_the_permalink()