我是python的新手,尝试使用文本分类。但是,每当我尝试拟合我的数据时,我都会收到错误,很清楚我做错了什么,但我不确定它是什么。有人可以解释我得到的错误吗?
from nltk.corpus import CategorizedPlaintextCorpusReader
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#assign each text files contained in a folder named : 'x' with the category 'x'.
#result of this line is capturing manual categorization of corpus into 0, 1, 2 classes, to train later.
#Training Folder conatin three folders of classes: 0, 1, 2, each is treated by reader as a category
#and each folder contain 100 files in txt formar.
reader = CategorizedPlaintextCorpusReader('~/CorpusMain/Training/',
r'.*\.txt', cat_pattern=r'(\w+)/*')
text = []
for file in reader.fileids():
text.append(reader.raw(fileids=file))
myarray = np.asarray(text)
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(myarray, reader.categories())
print(X_train_tfidf.shape)
print(X_train_tfidf)
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', LinearSVC()),])
text_clf = text_clf.fit(text, reader.categories())
这是我的全部输出:
(300, 35452)
(0, 13656) 0.116989332372
(0, 23881) 0.198926409043
(0, 16891) 0.0976295590378
(0, 12971) 0.227522474834
(0, 3584) 0.00403800859194
(0, 17447) 0.116688675696
(0, 7976) 0.00361099027578
(0, 4949) 0.200037729764
(0, 12900) 0.0470607385753
(0, 8365) 0.170168848649
(0, 35116) 0.00591313296587
(0, 34532) 0.0300031821031
(0, 28444) 0.0103225881505
(0, 7775) 0.0118262659317
(0, 35347) 0.00591313296587
(0, 31378) 0.0095967062934
(0, 31367) 0.00233592956364
(0, 32288) 0.00673471544479
(0, 8532) 0.00403800859194
(0, 34621) 0.0073385595246
(0, 26664) 0.00430397987711
(0, 11117) 0.00556197716333
(0, 31960) 0.00637226762455
(0, 14383) 0.00588259232191
(0, 26636) 0.00546461963895
: :
(299, 30776) 0.000463412877075
(299, 3574) 0.000926825754151
(299, 31792) 0.000463412877075
(299, 3531) 0.000463412877075
(299, 3510) 0.000463412877075
(299, 6746) 0.000463412877075
(299, 1039) 0.000463412877075
(299, 5055) 0.000463412877075
(299, 15334) 0.000463412877075
(299, 3591) 0.000463412877075
(299, 3624) 0.000463412877075
(299, 1468) 0.000463412877075
(299, 791) 0.000463412877075
(299, 3633) 0.000463412877075
(299, 1260) 0.000463412877075
(299, 1709) 0.000463412877075
(299, 1717) 0.000463412877075
(299, 862) 0.000463412877075
(299, 854) 0.000463412877075
(299, 19415) 0.000463412877075
(299, 5142) 0.000463412877075
(299, 4408) 0.000463412877075
(299, 27294) 0.000463412877075
(299, 15736) 0.000463412877075
(299, 23403) 0.000926825754151
Traceback (most recent call last):
File "/Users/nouraabdulaziz/PycharmProjects/untitled1/a.py", line 31, in <module>
text_clf = text_clf.fit(text, reader.categories())
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/pipeline.py", line 270, in fit
self._final_estimator.fit(Xt, y, **fit_params)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/svm/classes.py", line 207, in fit
dtype=np.float64, order="C")
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/utils/validation.py", line 531, in check_X_y
check_consistent_length(X, y)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/utils/validation.py", line 181, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError: Found input variables with inconsistent numbers of samples: [300, 3]
Process finished with exit code 1