我正在使用两个单独的CSV文件中的实际数据处理文本分类问题。
如下所示，我创建了两个 NumPy 数组，并在管道（Pipeline）中完成分类流程。我期望的输出是一个包含 X_test 和 Y_test 的 CSV 文件，其中 Y_test 为分类器预测的值。
# coding: utf-8
# In[24]:
import pandas as pd
# Load the training set; the SourcePath column is read as str.
data = pd.read_csv('test_data4.csv', encoding='latin1', dtype={'SourcePath': str})
# In[25]:
import numpy as np
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values is its documented replacement and yields the same 2-D ndarray.
numpy_array = data.values
X_train = numpy_array[:, 0]  # first column: input passed to the classifier's fit()
Y_train = numpy_array[:, 1]  # second column: target passed to the classifier's fit()
# In[26]:
data.head()
# In[27]:
# Load the data to classify; the SourcePath column is read as str.
data_test = pd.read_csv('data_go.csv', encoding='latin1', dtype={'SourcePath': str})
# In[28]:
data_test.head()
# In[29]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values is its documented replacement and yields the same 2-D ndarray.
numpy_array = data_test.values
X_test = numpy_array[:, 0]  # first column: input passed to the classifier's predict()
# In[30]:
print(X_test)
# In[31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Token counts (English stop words removed) -> TF-IDF weighting -> multinomial naive Bayes.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(X_train, Y_train)
predicted = text_clf.predict(X_test)
# This script never loads true labels for the test set, so no accuracy can be
# computed here (the former `np.mean(predicted == Y_test)` raised NameError).
# Y_test holds the classifier's predictions for X_test.
Y_test = predicted
# In[43]:
# One row per test document: text in the first column, predicted label in the second.
# Passing the frame's data as a dict keeps the two arrays as columns; the old
# `pd.DataFrame(X_test, Y_test)` used the labels as the row index instead.
# to_csv() without a path returns the CSV text; pass a file path to write a file.
print(pd.DataFrame({'X_test': X_test, 'Y_test': Y_test}).to_csv(index=False, header=False))