from nltk.corpus import CategorizedPlaintextCorpusReader
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Assign every text file contained in a folder named 'x' to the category 'x'.
# This captures the manual categorization of the corpus into classes 0, 1 and 2, used later for training.
# The Training folder contains three class folders (0, 1, 2); the reader treats each folder as a category,
# and each folder contains 100 files in .txt format.
# Corpus reader: the category of each file is taken from the leading folder
# name of its file id (e.g. '0/doc1.txt' -> category '0').
reader = CategorizedPlaintextCorpusReader('~/CorpusMain/Training/',
                                          r'.*\.txt', cat_pattern=r'(\w+)/*')

# Build the training data with exactly one document and one label per file.
# reader.categories() with no argument returns only the distinct category
# names (3 here), which cannot be used as the target vector for 300 documents;
# that mismatch is what raises
# "Found input variables with inconsistent numbers of samples".
fileids = reader.fileids()
text = [reader.raw(fileid) for fileid in fileids]
# Each file belongs to a single folder, so the first (only) category is its label.
labels = [reader.categories(fileids=[fileid])[0] for fileid in fileids]

# Stand-alone TF-IDF matrix of the corpus (rows are documents, columns are terms).
myarray = np.asarray(text)
tfidf = TfidfVectorizer()
# The vectorizer does not use the target; labels are passed only so the
# call stays aligned with the documents.
X_train_tfidf = tfidf.fit_transform(myarray, labels)

from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# Full classification pipeline: token counts -> TF-IDF weighting -> linear SVM.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LinearSVC()),])
# X (text) and y (labels) now have the same length: one entry per file.
text_clf = text_clf.fit(text, labels)
(300, 35452)
(0, 13656) 0.116989332372
(0, 23881) 0.198926409043
(0, 16891) 0.0976295590378
(0, 12971) 0.227522474834
(0, 3584) 0.00403800859194
(0, 17447) 0.116688675696
(0, 7976) 0.00361099027578
(0, 4949) 0.200037729764
(0, 12900) 0.0470607385753
(0, 8365) 0.170168848649
(0, 35116) 0.00591313296587
(0, 34532) 0.0300031821031
(0, 28444) 0.0103225881505
(0, 7775) 0.0118262659317
(0, 35347) 0.00591313296587
(0, 31378) 0.0095967062934
(0, 31367) 0.00233592956364
(0, 32288) 0.00673471544479
(0, 8532) 0.00403800859194
(0, 34621) 0.0073385595246
(0, 26664) 0.00430397987711
(0, 11117) 0.00556197716333
(0, 31960) 0.00637226762455
(0, 14383) 0.00588259232191
(0, 26636) 0.00546461963895
: :
(299, 30776) 0.000463412877075
(299, 3574) 0.000926825754151
(299, 31792) 0.000463412877075
(299, 3531) 0.000463412877075
(299, 3510) 0.000463412877075
(299, 6746) 0.000463412877075
(299, 1039) 0.000463412877075
(299, 5055) 0.000463412877075
(299, 15334) 0.000463412877075
(299, 3591) 0.000463412877075
(299, 3624) 0.000463412877075
(299, 1468) 0.000463412877075
(299, 791) 0.000463412877075
(299, 3633) 0.000463412877075
(299, 1260) 0.000463412877075
(299, 1709) 0.000463412877075
(299, 1717) 0.000463412877075
(299, 862) 0.000463412877075
(299, 854) 0.000463412877075
(299, 19415) 0.000463412877075
(299, 5142) 0.000463412877075
(299, 4408) 0.000463412877075
(299, 27294) 0.000463412877075
(299, 15736) 0.000463412877075
(299, 23403) 0.000926825754151
Traceback (most recent call last):
File "/Users/nouraabdulaziz/PycharmProjects/untitled1/a.py", line 31, in <module>
text_clf = text_clf.fit(text, reader.categories())
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/pipeline.py", line 270, in fit
self._final_estimator.fit(Xt, y, **fit_params)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/svm/classes.py", line 207, in fit
dtype=np.float64, order="C")
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/utils/validation.py", line 531, in check_X_y
check_consistent_length(X, y)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/utils/validation.py", line 181, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError: Found input variables with inconsistent numbers of samples: [300, 3]
Process finished with exit code 1