找到具有不一致样本数的输入变量

时间:2017-04-02 21:10:32

标签: scikit-learn sklearn-pandas

看起来我有些不一致,但我无法排除故障。

def train(classifier, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
    print(X_train.shape) 
    print(y_train.shape)
    classifier.fit(X_train, y_train)


    print ("Accuracy: %s" % classifier.score(X_test, y_test))
    return classifier


def load_data_and_labels(filename):
    """Load sentences and labels"""
    df = pd.read_csv(filename, compression='zip')
    columns = df.columns.tolist()
    columns = [c for c in columns if c not in ["col1","_id", "col2", "col3"]]
    target = "target_col"
    df= df.dropna(axis=0, how='any', subset=columns) # Drop null rows
    # x_raw = df[columns]
    # y_raw = df[target]
    return df[columns], df[target],df

input_file = '.file.zip'
x_raw, y_raw,df=load_data_and_labels(input_file)




trial1 = Pipeline([
        ('vectorizer', TfidfVectorizer()),
        ('classifier', MultinomialNB()),
    ])

train(trial1, x_raw, y_raw)

我收到了以下错误

  ---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-411-be9c83097d5b> in <module>()
      5 ])
      6 
----> 7 train(trial1, x_raw, y_raw)

<ipython-input-407-33b2ec7fc112> in train(classifier, X, y)
      3     print(X_train.shape)
      4     print(y_train.shape)
----> 5     classifier.fit(X_train, y_train)
      6 
      7 

/home/manisha/anaconda3/lib/python3.5/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    268         Xt, fit_params = self._fit(X, y, **fit_params)
    269         if self._final_estimator is not None:
--> 270             self._final_estimator.fit(Xt, y, **fit_params)
    271         return self
    272 

/home/manisha/anaconda3/lib/python3.5/site-packages/sklearn/naive_bayes.py in fit(self, X, y, sample_weight)
    560             Returns self.
    561         """
--> 562         X, y = check_X_y(X, y, 'csr')
    563         _, n_features = X.shape
    564 

/home/manisha/anaconda3/lib/python3.5/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    529         y = y.astype(np.float64)
    530 
--> 531     check_consistent_length(X, y)
    532 
    533     return X, y

/home/manisha/anaconda3/lib/python3.5/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
    179     if len(uniques) > 1:
    180         raise ValueError("Found input variables with inconsistent numbers of"
--> 181                          " samples: %r" % [int(l) for l in lengths])
    182 
    183 

ValueError: Found input variables with inconsistent numbers of samples: [3, 2997]

我的X.train和y.train的形状是(2997,3)     (2997,)接受。

0 个答案:

没有答案