我是python中的新手,我试图标记化并阻止推文创建模型,然后使用gridsearch查找最佳超参数,我可以接受任何类型的反馈
这是我的代码:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
spanish_stopwords = stopwords.words('spanish')
from string import punctuation
non_words = list(punctuation)
#we add spanish punctuation
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
stemmer = SnowballStemmer('spanish')
def stem_tokens(tokens, stemmer):
stemmed = []
for item in tokens:
stemmed.append(stemmer.stem(item))
return stemmed
def tokenize(text):
# remove non letters
text = ''.join([c for c in text if c not in non_words])
# tokenize
tokens = word_tokenize(text)
# stem
try:
stems = stem_tokens(tokens, stemmer)
except Exception as e:
print(e)
print(text)
stems = ['']
return stems
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
tweets_corpus = tweets_corpus[tweets_corpus.polarity != 'NEU']
tweets_corpus['polarity_bin'] = 0
tweets_corpus.polarity_bin[tweets_corpus.polarity.isin(['P', 'P+'])] = 1
print(tweets_corpus.polarity_bin.value_counts(normalize=True))
if __name__ == '__main__':
import tokenize
vectorizer = CountVectorizer(
analyzer = 'word',
tokenizer = tokenize,
lowercase = True,
stop_words = spanish_stopwords)
pipeline = Pipeline([
('vect', vectorizer),
('cls', LinearSVC()),
])
parameters = {
'vect__max_df': (0.5, 1.9),
'vect__min_df': (10, 20,50),
'vect__max_features': (500, 1000),
'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
'cls__C': (0.2, 0.5, 0.7),
'cls__loss': ('hinge', 'squared_hinge'),
'cls__max_iter': (500, 1000)
}
from time import time
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1,scoring='roc_auc')
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
grid_search.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
print(grid_search.best_params_)
t0 = time()
print("done in %0.3fs" % (time() - t0))
这是我试图升级的数据样本
Name: polarity_bin, dtype: float64
agreement \
270 NaN
208 NaN
902 NaN
31056 NaN
1158 NaN
content \
270 @revolucion2017 @Pablo_Iglesias_ Cultura es reflexionar sobre algo q ha dicho alguien y si te gusta hacerlo tuyo.pq no?
208 @_UnaOpinionMas_ @PPopular En eso estoi de acuerdo por lo menos al PP se le ve que hace cosas y contara d nuevo cn mi voto como siempre.
902 "Grande Casillas : ""Esta victoria no solo es nuestra sino también de Jesé ."""
31056 ¿Querían que Contador analizara cualquier cosa que fuera a tomar o que la vomitara meses después para mandarla al puto laboratorio?
1158 Eliminados de champion , van terceros en la Liga y pierden la final copa del Rey , PURO REAL MADRID
polarity polarity_bin
270 P 1
208 P 1
902 P 1
31056 N 0
1158 N 0
这就是错误:
TypeError Traceback (most recent call last)
<ipython-input-9-7c9b6a1bac93> in <module>()
201 print("Performing grid search...")
202 print("pipeline:", [name for name, _ in pipeline.steps])
--> 203 grid_search.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
204 print(grid_search.best_params_)
205 t0 = time()
C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\grid_search.py in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\grid_search.py in _fit(self, X, y, parameter_iterable)
539 n_candidates * len(cv)))
540
--> 541 base_estimator = clone(self.estimator)
542
543 pre_dispatch = self.pre_dispatch
C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe)
49 new_object_params = estimator.get_params(deep=False)
50 for name, param in six.iteritems(new_object_params):
---> 51 new_object_params[name] = clone(param, safe=False)
52 new_object = klass(**new_object_params)
53 params_set = new_object.get_params(deep=False)
C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe)
37 # XXX: not handling dictionaries
38 if estimator_type in (list, tuple, set, frozenset):
---> 39 return estimator_type([clone(e, safe=safe) for e in estimator])
40 elif not hasattr(estimator, 'get_params'):
41 if not safe:
C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in <listcomp>(.0)
37 # XXX: not handling dictionaries
38 if estimator_type in (list, tuple, set, frozenset):
---> 39 return estimator_type([clone(e, safe=safe) for e in estimator])
40 elif not hasattr(estimator, 'get_params'):
41 if not safe:
C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe)
37 # XXX: not handling dictionaries
38 if estimator_type in (list, tuple, set, frozenset):
---> 39 return estimator_type([clone(e, safe=safe) for e in estimator])
40 elif not hasattr(estimator, 'get_params'):
41 if not safe:
C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in <listcomp>(.0)
37 # XXX: not handling dictionaries
38 if estimator_type in (list, tuple, set, frozenset):
---> 39 return estimator_type([clone(e, safe=safe) for e in estimator])
40 elif not hasattr(estimator, 'get_params'):
41 if not safe:
C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe)
49 new_object_params = estimator.get_params(deep=False)
50 for name, param in six.iteritems(new_object_params):
---> 51 new_object_params[name] = clone(param, safe=False)
52 new_object = klass(**new_object_params)
53 params_set = new_object.get_params(deep=False)
C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe)
40 elif not hasattr(estimator, 'get_params'):
41 if not safe:
---> 42 return copy.deepcopy(estimator)
43 else:
44 raise TypeError("Cannot clone object '%s' (type %s): "
C:\Users\Miguel\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
180 raise Error(
181 "un(deep)copyable object of type %s" % cls)
--> 182 y = _reconstruct(x, rv, 1, memo)
183
184 # If is its own copy, don't memoize.
C:\Users\Miguel\Anaconda3\lib\copy.py in _reconstruct(x, info, deep, memo)
296 if state:
297 if deep:
--> 298 state = deepcopy(state, memo)
299 if hasattr(y, '__setstate__'):
300 y.__setstate__(state)
C:\Users\Miguel\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
153 copier = _deepcopy_dispatch.get(cls)
154 if copier:
--> 155 y = copier(x, memo)
156 else:
157 try:
C:\Users\Miguel\Anaconda3\lib\copy.py in _deepcopy_dict(x, memo)
242 memo[id(x)] = y
243 for key, value in x.items():
--> 244 y[deepcopy(key, memo)] = deepcopy(value, memo)
245 return y
246 d[dict] = _deepcopy_dict
C:\Users\Miguel\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
180 raise Error(
181 "un(deep)copyable object of type %s" % cls)
--> 182 y = _reconstruct(x, rv, 1, memo)
183
184 # If is its own copy, don't memoize.
C:\Users\Miguel\Anaconda3\lib\copy.py in _reconstruct(x, info, deep, memo)
296 if state:
297 if deep:
--> 298 state = deepcopy(state, memo)
299 if hasattr(y, '__setstate__'):
300 y.__setstate__(state)
C:\Users\Miguel\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
153 copier = _deepcopy_dispatch.get(cls)
154 if copier:
--> 155 y = copier(x, memo)
156 else:
157 try:
C:\Users\Miguel\Anaconda3\lib\copy.py in _deepcopy_dict(x, memo)
242 memo[id(x)] = y
243 for key, value in x.items():
--> 244 y[deepcopy(key, memo)] = deepcopy(value, memo)
245 return y
246 d[dict] = _deepcopy_dict
C:\Users\Miguel\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
172 reductor = getattr(x, "__reduce_ex__", None)
173 if reductor:
--> 174 rv = reductor(4)
175 else:
176 reductor = getattr(x, "__reduce__", None)
TypeError: cannot serialize '_io.TextIOWrapper' object
感谢您的时间 BTW Im在Windows 10中工作并更新了所有工具