Question

我有一个如下所示的数据集，其中包含大约一千个观测值。我想绘制一个在 negative 和 positive 两个类别之间进行分类的 SVM 超平面图。我使用了下面这段基本上取自 Scikit-learn 官网的代码。但是，它给出了如下错误：

ValueError: Found input variables with inconsistent numbers of samples: [2, 214]

 df_1.head() 
                                              text   cat  lexicon
   0   [india, education, commission, report, ma...   1   negative
   1   [national, education, policy, 2017, by, min... 2   positive
              ......
 1000  [india, education, commission, report, ma...

我使用了以下步骤：

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

# Script: build bag-of-words counts from df_1.text, fit a linear SVM on the
# df_1.lexicon labels, and try to draw the decision boundary over the first
# two feature columns.

# Count vectorizer configured with the built-in English stop-word list and
# min_df=10 (terms below that document-frequency threshold are dropped).
vectorizer = CountVectorizer(stop_words='english', min_df = 10)

# NOTE(review): the df_1.head() preview shows each `text` entry as a list of
# tokens, not a single string; CountVectorizer presumably expects raw strings
# by default -- confirm, and join the tokens if so.
text = df_1.text.tolist()
# NOTE(review): fit_transform presumably returns a scipy sparse matrix with one
# column per vocabulary term (likely far more than two) -- verify x.shape.
x = vectorizer.fit_transform(text)
# Class labels; the preview shows string values such as 'negative'/'positive'.
y = df_1.lexicon.tolist()

# Hold out 30% of the samples for testing; random_state=0 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, 
random_state=0)

fig, ax = plt.subplots()
# Fit the linear SVM on the training portion only.
model = svm.LinearSVC(C=1).fit(x_train, y_train)

# get the separating hyperplane
# Only w[0] and w[1] are used, i.e. the line w[0]*x0 + w[1]*x1 + b = 0 solved
# for x1. NOTE(review): this describes the full boundary only if the model has
# exactly two features -- confirm against x.shape[1].
w = model.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (model.intercept_[0]) / w[1]

# create a mesh to plot in
# Axis limits are taken from the first two columns of x, padded by 1.
x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1
y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1
xx2, yy2 = np.meshgrid(np.arange(x_min, x_max, .2),
                 np.arange(y_min, y_max, .2))
# Each mesh point passed to predict has exactly two columns.
# NOTE(review): predict presumably requires the same number of features the
# model was trained on -- this will likely fail unless x has two columns.
Z = model.predict(np.c_[xx2.ravel(), yy2.ravel()])

# Reshape the flat predictions back onto the grid for contour plotting.
Z = Z.reshape(xx2.shape)
ax.contourf(xx2, yy2, Z, cmap=plt.cm.coolwarm, alpha=0.3)
# NOTE(review): `X` (uppercase) is never assigned in this script -- only `x`
# is; this looks like a typo that would raise NameError. Also, `c=y` passes
# the label values as colors; string labels presumably need to be encoded
# numerically before they can be colormapped -- confirm.
ax.scatter(x[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, s=25)
# Overlay the boundary line computed above.
ax.plot(xx,yy)

# Clamp the view to the mesh extent.
ax.axis([x_min, x_max,y_min, y_max])
plt.show()

收到上述错误后，我还尝试使用以下代码转换x和y，但得到类似的错误。有人可以提出解决方案吗？谢谢。

# Replace the labels in y with integer codes: the inverse indices returned by
# np.unique; the array of unique values itself is discarded.
_, y = np.unique(y, return_inverse=True)
# NOTE(review): with return_inverse=True, np.unique returns a
# (unique_values, inverse_indices) tuple. It is not unpacked here, so x becomes
# a 2-element tuple -- presumably the source of the "2" in the reported
# "[2, 214]" sample-count mismatch; confirm with len(x).
x = np.unique(x, return_inverse=True)

详细错误：

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-57-abc7d45a71c2> in <module>()
      3 from sklearn import svm
      4 
----> 5 x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=0)
      6 
      7 

~\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py in train_test_split(*arrays, **options)
   2029         test_size = 0.25
   2030 
-> 2031     arrays = indexable(*arrays)
   2032 
   2033     if shuffle is False:

~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in indexable(*iterables)
    227         else:
    228             result.append(np.array(X))
--> 229     check_consistent_length(*result)
    230     return result
    231 

~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
    202     if len(uniques) > 1:
    203         raise ValueError("Found input variables with inconsistent numbers of"
--> 204                          " samples: %r" % [int(l) for l in lengths])
    205 
    206 

ValueError: Found input variables with inconsistent numbers of samples: [2, 214]

但是，如果我去掉训练/测试集划分（不使用 x_train 和 y_train），直接用 x 和 y 拟合模型，则会出现以下错误：

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-58-1009080f56c2> in <module>()
     22 
     23 fig, ax = plt.subplots()
---> 24 model = svm.LinearSVC(C=1).fit(x,y)
     25 
     26 # get the separating hyperplane

~\Anaconda3\lib\site-packages\sklearn\svm\classes.py in fit(self, X, y, sample_weight)
    225 
    226         X, y = check_X_y(X, y, accept_sparse='csr',
--> 227                          dtype=np.float64, order="C")
    228         check_classification_targets(y)
    229         self.classes_ = np.unique(y)

~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    571     X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
    572                     ensure_2d, allow_nd, ensure_min_samples,
--> 573                     ensure_min_features, warn_on_dtype, estimator)
    574     if multi_output:
    575         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,

~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    431                                       force_all_finite)
    432     else:
--> 433         array = np.array(array, dtype=dtype, order=order, copy=copy)
    434 
    435         if ensure_2d:

ValueError: setting an array element with a sequence.

使用线性 SVM 模型绘制超平面图

0 个答案: