看起来我有些不一致,但我无法排除故障。
def train(classifier, X, y):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
print(X_train.shape)
print(y_train.shape)
classifier.fit(X_train, y_train)
print ("Accuracy: %s" % classifier.score(X_test, y_test))
return classifier
def load_data_and_labels(filename):
"""Load sentences and labels"""
df = pd.read_csv(filename, compression='zip')
columns = df.columns.tolist()
columns = [c for c in columns if c not in ["col1","_id", "col2", "col3"]]
target = "target_col"
df= df.dropna(axis=0, how='any', subset=columns) # Drop null rows
# x_raw = df[columns]
# y_raw = df[target]
return df[columns], df[target],df
input_file = '.file.zip'
x_raw, y_raw,df=load_data_and_labels(input_file)
trial1 = Pipeline([
('vectorizer', TfidfVectorizer()),
('classifier', MultinomialNB()),
])
train(trial1, x_raw, y_raw)
我收到了以下错误
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-411-be9c83097d5b> in <module>()
5 ])
6
----> 7 train(trial1, x_raw, y_raw)
<ipython-input-407-33b2ec7fc112> in train(classifier, X, y)
3 print(X_train.shape)
4 print(y_train.shape)
----> 5 classifier.fit(X_train, y_train)
6
7
/home/manisha/anaconda3/lib/python3.5/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
--> 270 self._final_estimator.fit(Xt, y, **fit_params)
271 return self
272
/home/manisha/anaconda3/lib/python3.5/site-packages/sklearn/naive_bayes.py in fit(self, X, y, sample_weight)
560 Returns self.
561 """
--> 562 X, y = check_X_y(X, y, 'csr')
563 _, n_features = X.shape
564
/home/manisha/anaconda3/lib/python3.5/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
529 y = y.astype(np.float64)
530
--> 531 check_consistent_length(X, y)
532
533 return X, y
/home/manisha/anaconda3/lib/python3.5/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
179 if len(uniques) > 1:
180 raise ValueError("Found input variables with inconsistent numbers of"
--> 181 " samples: %r" % [int(l) for l in lengths])
182
183
ValueError: Found input variables with inconsistent numbers of samples: [3, 2997]
我的X.train和y.train的形状是(2997,3) (2997,)接受。