# Train a decision tree on TF-IDF features of the cleaned sentences and
# report its in-sample (training) and out-of-sample (testing) accuracy.

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    shuffled_df.sentence_clean,
    shuffled_df.pol,
    test_size=0.30,
    random_state=42,
)

# Keep only terms that appear in at least 0.8% and at most 10% of documents.
tfidf1 = TfidfVectorizer(min_df=0.008, max_df=0.1)

# Use TF-IDF to extract features. The vectorizer is fitted on the training
# split ONLY; the test split is merely transformed with that fitted
# vocabulary. Calling fit_transform() on the test split would learn a second,
# different vocabulary, producing a matrix whose column count does not match
# what the classifier was trained on (the "n_features is 251 / input
# n_features is 248" error), and would also leak test-set statistics.
x_traincv = tfidf1.fit_transform(X_train)
x_testcv = tfidf1.transform(X_test)

# Set the polarities as integers.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

c = tree.DecisionTreeClassifier()

# Train the classifier on the training features.
c.fit(x_traincv, Y_train)

# Accuracy is the fraction of correctly predicted labels; score() computes
# exactly that, so there is no need to compare predictions by hand.
training_accuracy = c.score(x_traincv, Y_train)
testing_accuracy = c.score(x_testcv, Y_test)