在python中运行SVM时出现NaN错误

时间:2015-10-05 04:36:37

标签: python numpy scikit-learn

我正在运行以下代码。

# Function for Stochastic Gradient Descent SVM
def SGD(k_fold, train_X, train_Y):
    """Cross-validate a linear multi-class SVM trained with Stochastic Gradient Descent.

    For every ``(train_indices, test_indices)`` pair yielded by ``k_fold`` a
    fresh ``SGDClassifier(loss='hinge', penalty='l2')`` is fitted on the
    training slice and scored on the test slice.  The mean of the per-fold
    scores is printed.

    Parameters
    ----------
    k_fold : iterable of (train_indices, test_indices)
        Index sets used to slice ``train_X`` and ``train_Y``.
    train_X : indexable
        Feature container, sliced as ``train_X[indices]``.
    train_Y : indexable
        Label container, sliced as ``train_Y[indices]``.

    Returns
    -------
    The classifier fitted on the last fold.

    Raises
    ------
    ValueError
        If the floating-point labels of a training fold contain NaN or
        infinity, or if ``k_fold`` yields no split at all.
    """
    from sklearn.linear_model import SGDClassifier

    scores_sgd = []
    sgd = None

    for fold_no, (train_indices, test_indices) in enumerate(k_fold, 1):
        train_X_cv = train_X[train_indices]
        train_Y_cv = train_Y[train_indices]

        test_X_cv = train_X[test_indices]
        test_Y_cv = train_Y[test_indices]

        # The fit would reject non-finite labels anyway, but with a message
        # that does not say whether X or y is at fault; checking only
        # train_X for NaN is not enough.
        labels = np.asarray(train_Y_cv)
        if labels.dtype.kind == "f" and not np.isfinite(labels).all():
            raise ValueError(
                "train_Y contains NaN or infinity in the training part of "
                "fold %d; clean the labels before fitting" % fold_no
            )

        sgd = SGDClassifier(loss='hinge', penalty='l2')
        sgd.fit(train_X_cv, train_Y_cv)
        scores_sgd.append(sgd.score(test_X_cv, test_Y_cv))

    if sgd is None:
        # Without this guard an empty k_fold ends in an UnboundLocalError.
        raise ValueError("k_fold yielded no train/test splits")

    print("The mean accuracy of Stochastic Gradient Descent Classifier on CV data is:", np.mean(scores_sgd))

    return sgd

这是 k_folds 数据的样子:

# Show the number of train / test indices in every fold.
# A single pre-formatted string prints the same on Python 2 and Python 3.
for a, b in k_fold:
    print("%d %d" % (len(a), len(b)))

我甚至检查了我的 train_X、train_Y 中是否含有 NaN

>>> numpy.isnan( train_X ).any()
False

但我仍然收到此错误

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-22-a2f71582edd0> in <module>()
      1 # Running SGD. and RF
      2 
----> 3 sgd=SGD(k_fold,train_X,train_Y)
      4 #rf=RF(k_fold,train_X,train_Y)

<ipython-input-12-7e3b6395f3d6> in SGD(k_fold, train_X, train_Y)
     62 
     63             sgd=SGDClassifier(loss='hinge',penalty='l2')
---> 64             scores_sgd.append(sgd.fit(train_X_cv,train_Y_cv).score(test_X_cv,test_Y_cv))
     65 
     66         print("The mean accuracy of Stochastic Gradient Descent Classifier on CV data is:", np.mean(scores_sgd))

/Users/mtripathi/anaconda/lib/python2.7/site-packages/sklearn/linear_model/stochastic_gradient.pyc in fit(self, X, y, coef_init, intercept_init, class_weight, sample_weight)
    562                          loss=self.loss, learning_rate=self.learning_rate,
    563                          coef_init=coef_init, intercept_init=intercept_init,
--> 564                          sample_weight=sample_weight)
    565 
    566 

/Users/mtripathi/anaconda/lib/python2.7/site-packages/sklearn/linear_model/stochastic_gradient.pyc in _fit(self, X, y, alpha, C, loss, learning_rate, coef_init, intercept_init, sample_weight)
    401             self.classes_ = None
    402 
--> 403         X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C")
    404         n_samples, n_features = X.shape
    405 

/Users/mtripathi/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric)
    448     else:
    449         y = column_or_1d(y, warn=True)
--> 450         _assert_all_finite(y)
    451     if y_numeric and y.dtype.kind == 'O':
    452         y = y.astype(np.float64)

/Users/mtripathi/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in _assert_all_finite(X)
     50             and not np.isfinite(X).all()):
     51         raise ValueError("Input contains NaN, infinity"
---> 52                          " or a value too large for %r." % X.dtype)
     53 
     54 

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

请帮忙。我也尝试了 random forest,但它会导致内核被杀掉,我不知道为什么。

1 个答案:

答案 0 :(得分:0)

如果您正在为未经清洗(sanitised)的数据集(DataSET)而苦恼:

# Sanitise the data: nan_to_num replaces NaN with zero and +/-infinity with
# large finite numbers; the result is then cast to float32.
# NOTE(review): for float64 input the infinity replacement is about 1.8e308,
# which presumably does not fit into float32 and would become inf again after
# the cast -- confirm, and consider casting to float32 before nan_to_num.
np.nan_to_num( ... ).astype( np.float32 ) # may help to prototype the methods


"""
Replace nan with zero and inf with finite numbers.

Returns an array or scalar replacing Not a Number (NaN) with zero,
(positive) infinity with a very large number and negative infinity
with a very small (or negative) number.

Parameters
----------
x : array_like
    Input data.

Returns
-------
out : ndarray, float
    Array with the same shape as `x` and dtype of the element in `x`  with
    the greatest precision. NaN is replaced by zero, and infinity
    (-infinity) is replaced by the largest (smallest or most negative)
    floating point value that fits in the output dtype. All finite numbers
    are upcast to the output dtype (default float64).

See Also
--------
isinf    : Shows which elements are positive or negative infinity.
isneginf : Shows which elements are negative infinity.
isposinf : Shows which elements are positive infinity.
isnan    : Shows which elements are Not a Number (NaN).
isfinite : Shows which elements are finite (not NaN, not infinity)

Notes
-----
Numpy uses the IEEE Standard for Binary Floating-Point for Arithmetic
(IEEE 754). This means that Not a Number is not equivalent to infinity.


Examples
--------
>>> np.set_printoptions(precision=8)
>>> x = np.array([np.inf, -np.inf, np.nan, -128, 128])
>>> np.nan_to_num(x)
array([  1.79769313e+308,  -1.79769313e+308,   0.00000000e+000,
        -1.28000000e+002,   1.28000000e+002])

"""

一旦操作系统(O / S)开始杀掉 RandomForest / SVM 的内核进程:

1)调整(与 O / S 相关的)可用 RAM 大小/虚拟内存管理选项,并在可能的情况下考虑升级。

2)缩减数据集(DataSET)的表示,将 np.float64 降级(downcast)为 np.float32

3)避免在 RAM 内产生重复副本;另外,由于 python 内存管理器不愿意把已分配的内存归还给 O / S 重新使用,必要时可以改用 numpy 的非 RAM 内矩阵表示(内存映射文件)。

4)最后,在接近 RAM 上限时,可能会受益于 RandomForest 的增量构建( warm_start = True )

如果您的localhost O / S不允许所需的大小,请将您的处理移至不同的O / S平台/云托管处理,其中RAM大小和O / S允许处理更大的RAM内对象。

用于获取python内存使用情况的小型实用程序

def getExtMemoryUsed():
    """Print the extended memory counters of the current process and return them.

    A lightweight diagnostic, originally written while debugging 2GB memory
    crashes in python on Windows XP ( /3GB [BOOT.ini]-options ).  The byte
    counters are printed in MB ( value / 2**20 ), followed by the raw number
    of page faults.

    The counters are read by field name, so the psutil result must expose
    the field set shown in the sample below; otherwise AttributeError is
    raised instead of printing mislabelled numbers.

    Returns
    -------
    The named tuple delivered by psutil for the current process.

    Sample
    ------
    |>>> getExtMemoryUsed()
        1534.012 _______peak_pagefile  MB
        1195.777 ____________pagefile  MB
        1195.777 _____________private  MB
        1119.109 ___________peak_wset  MB
         662.457 ________________wset  MB
           0.321 _____peak_paged_pool  MB
           0.321 __________paged_pool  MB
           0.084 __peak_nonpaged_pool  MB
           0.083 _______nonpaged_pool  MB
                 _____num_page_faults     3727767 x

    pextmem(num_page_faults=3727767, peak_wset=1173471232, wset=694636544, peak_paged_pool=336316, paged_pool=336172, peak_nonpaged_pool=87968, nonpaged_pool=87480, pagefile=1253863424, peak_pagefile=1608527872, private=1253863424)
    """
    import os
    import psutil

    aProcess = psutil.Process( os.getpid() )

    # psutil has renamed this accessor across releases; the original name is
    # tried first, so behaviour on the original installation is unchanged.
    for anAccessorNAME in ( "get_ext_memory_info", "memory_info_ex", "memory_info" ):
        anAccessor = getattr( aProcess, anAccessorNAME, None )
        if anAccessor is not None:
            break
    else:
        raise AttributeError( "psutil.Process offers no memory-info accessor" )
    aMemoryINFO = anAccessor()

    # ( field name, unit ) in display order
    aShowLIST = ( ( "peak_pagefile",      "MB" ),
                  ( "pagefile",           "MB" ),
                  ( "private",            "MB" ),
                  ( "peak_wset",          "MB" ),
                  ( "wset",               "MB" ),
                  ( "peak_paged_pool",    "MB" ),
                  ( "paged_pool",         "MB" ),
                  ( "peak_nonpaged_pool", "MB" ),
                  ( "nonpaged_pool",      "MB" ),
                  )
    for aName, aUnit in aShowLIST:
        # a single pre-formatted string prints the same on Python 2 and 3
        print( "{0: > 16.3f} {2:_>20s} {1: >3s} ".format( getattr( aMemoryINFO, aName ) / float( 2**20 ), aUnit, aName ) )
    print( "                 {2:_>20s} {0: >11d} x\n".format( aMemoryINFO.num_page_faults, "", "num_page_faults" ) )

    return aMemoryINFO

这段小代码比功能完备的内存分析器(profiler)附加组件占用更少的内存,同时能帮助您诊断代码距离 RAM 上限还有多远。

对于眼尖的(hawk-eyed)pythoneers:本帖故意使用了非 PEP-8 的源代码格式,因为作者的经验是,在学习阶段,代码的可读性有助于把注意力集中在任务的解决方案上,并有助于熟悉底层概念,而不是把精力花在形式上的排版上。希望这种乐于助人的原则能得到尊重,也希望以易读之名原谅这种非 PEP-8 的格式。