首先,我要说的是我正在处理具有文本功能的数据集,因此我尝试在其上套上单词和tf-idf。第一步,我在特征上应用了CountVectorizer和TfidfTransformer并通过训练数据训练了MultinomialNb;但是,当我转换测试数据时,会遇到此错误:
ValueError:模型已使用n_features = 48064训练时,输入具有n_features = 11805
有关更多详细信息,我的代码如下:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
df = pd.read_csv('DK-BM-products.csv')
df.head()
col = ['url_id', 'bm_title', 'bm_brand', 'bm_cat1', 'bm_cat2', 'bm_cat3','bm_warranty', 'DK_Entitle', 'DK_Fatitle', 'DK_titlealt', 'DK_titleKey','DK_catFa', 'Dk_catEn', 'DK_maincatFa', 'DK_maincatEn', 'DK_brandEn', 'DK_brandFa']
df = df[col]
df.dropna(inplace=True)
# Train Test Splitting
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]
y_train = train['DK_maincatEn']
x_train = train.drop(['DK_maincatEn', 'url_id'], axis=1)
y_test = test['DK_maincatEn']
x_test = test.drop(['DK_maincatEn', 'url_id'], axis=1)
# Feature Engineering
count_vect = CountVectorizer(encoding='utf-8', stop_words='english', strip_accents='unicode', lowercase=True, analyzer='word')
x_train_counts = count_vect.fit_transform(x_train)
tfidf_transformer = TfidfTransformer(use_idf=True)
features_train = tfidf_transformer.fit_transform(x_train_counts)
#Train CLF
clf = MultinomialNB().fit(features_train, y_train)
accu_train = np.mean(clf.predict(features_train) == y_train)
#Test
x_test_counts = count_vect.transform(x_test)
features_test = tfidf_transformer.transform(x_test_counts)
predicted = clf.predict(features_test)