MemoryError cross_val_score Jupyter笔记本

时间:2019-02-16 12:40:34

标签: python machine-learning scikit-learn

我是编程和机器学习的新手。我正在做KNN和亚马逊美食评论的作业,但遇到此错误。

我的代码:

from sklearn.model_selection import train_test_split

Y = data['Score'].values
X_with_stop= data['Text_with_stop'].values
X_no_stop = data['New_Text'].values

X_with_stop_train, X_with_stop_test, y_train, y_test = train_test_split(X_with_stop, Y, test_size=0.33, shuffle=False)

print(X_with_stop_train.shape, y_train.shape,y_test.shape)

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
bow_X_train_brute = vectorizer.fit_transform(X_with_stop_train)
bow_X_test_brute = vectorizer.transform(X_with_stop_test)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

neighbors = list(range(3,99,2))

cv_scores = []

for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
    scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

MSE = [1 - x for x in cv_scores]

# determining best k
optimal_k = neighbors[MSE.index(min(MSE))]
print ("The optimal number of neighbors is %d" % optimal_k)

# plot misclassification error vs k
plt.plot(neighbors, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.title("Plot for K vs Error for Brute force algorithm")
plt.show()

输出:

(413629,) (413629,) (203729,)

我遇到的错误如下:

MemoryError                               Traceback (most recent call last)
<ipython-input-17-f1ce8e46a2a3> in <module>()
 43 for k in neighbors:
 44     knn = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
---> 45     scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
46     cv_scores.append(scores.mean())
47 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
340                                 n_jobs=n_jobs, verbose=verbose,
341                                 fit_params=fit_params,
--> 342                                 pre_dispatch=pre_dispatch)
343     return cv_results['test_score']
344 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
204             fit_params, return_train_score=return_train_score,
205             return_times=True)
--> 206         for train, test in cv.split(X, y, groups))
207 
208     if return_train_score:

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
780                 self._iterating = True
781             else:

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623                 return False
624             else:
--> 625                 self._dispatch(tasks)
626                 return True
627 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586         dispatch_timestamp = time.time()
587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
589         self._jobs.append(job)
    590 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333 
    334     def get(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    486         fit_time = time.time() - start_time
    487         # _score will return dict if is_multimetric is True
--> 488         test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
    489         score_time = time.time() - start_time - fit_time
    490         if return_train_score:

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
    521     """
    522     if is_multimetric:
--> 523         return _multimetric_score(estimator, X_test, y_test, scorer)
    524     else:
    525         if y_test is None:

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
    551             score = scorer(estimator, X_test)
    552         else:
--> 553             score = scorer(estimator, X_test, y_test)
    554 
    555         if hasattr(score, 'item'):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, estimator, X, y_true, sample_weight)
     99         super(_PredictScorer, self).__call__(estimator, X, y_true,
    100                                              sample_weight=sample_weight)
--> 101         y_pred = estimator.predict(X)
    102         if sample_weight is not None:
    103             return self._sign * self._score_func(y_true, y_pred,

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\classification.py in predict(self, X)
    143         X = check_array(X, accept_sparse='csr')
    144 
--> 145         neigh_dist, neigh_ind = self.kneighbors(X)
    146 
    147         classes_ = self.classes_

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\base.py in kneighbors(self, X, n_neighbors, return_distance)
    355             if self.effective_metric_ == 'euclidean':
    356                 dist = pairwise_distances(X, self._fit_X, 'euclidean',
--> 357                                           n_jobs=n_jobs, squared=True)
    358             else:
    359                 dist = pairwise_distances(

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in pairwise_distances(X, Y, metric, n_jobs, **kwds)
   1245         func = partial(distance.cdist, metric=metric, **kwds)
   1246 
-> 1247     return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
   1248 
   1249 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
   1088     if n_jobs == 1:
   1089         # Special case to avoid picklability checks in delayed
-> 1090         return func(X, Y, **kwds)
   1091 
   1092     # TODO: in some cases, backend='threading' may be appropriate

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in euclidean_distances(X, Y, Y_norm_squared, squared, X_norm_squared)
    244         YY = row_norms(Y, squared=True)[np.newaxis, :]
    245 
--> 246     distances = safe_sparse_dot(X, Y.T, dense_output=True)
    247     distances *= -2
    248     distances += XX

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\extmath.py in safe_sparse_dot(a, b, dense_output)
    133     """
    134     if issparse(a) or issparse(b):
--> 135         ret = a * b
    136         if dense_output and hasattr(ret, "toarray"):
    137             ret = ret.toarray()

C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\base.py in __mul__(self, other)
    477             if self.shape[1] != other.shape[0]:
    478                 raise ValueError('dimension mismatch')
--> 479             return self._mul_sparse_matrix(other)
    480 
    481         # If it's a list or whatever, treat it like a matrix

C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\compressed.py in _mul_sparse_matrix(self, other)
    500                                     maxval=nnz)
    501         indptr = np.asarray(indptr, dtype=idx_dtype)
--> 502         indices = np.empty(nnz, dtype=idx_dtype)
    503         data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
    504 

1 个答案:

答案 0 :(得分:0)

import plotly.plotly as py import plotly import plotly.figure_factory as ff from plotly.offline import download_plotlyjs, init_notebook_mode, plot,iplot init_notebook_mode(connected=True) flow = color_table.iloc[:,3] data = [dict(type = 'choroplet', colorscale = 'Rainbow', location = color_table['COUNTRY_NAME_IT_y'], z = flow, text = color_table['COUNTRY_NAME_IT_x'], colorbar = dict(title = 'Flow of foreign users', titlefont=dict(size=25),tickfont=dict(size=18)), )] layout = dict(title = 'Flow of foreign users', geo = dict(showframe = False,showcoastlines = False,projection = dict(type = 'equirectangular')) ) fig = dict(layout=layout, data=data) iplot(fig,validate=False, filename='d3-world-map') 通常意味着您用完了RAM。看到数据集的大小,我认为这可能是一个合理的解释。

可以肯定的是,只需在执行代码时查看一下RAM的使用情况即可。