
时间:2015-10-05 04:36:37

标签: python numpy scikit-learn


# Function for Stochastic Gradient Descent SVM
def SGD(k_fold, train_X, train_Y):
    """Cross-validate a multi-class SVM trained with Stochastic Gradient Descent.

    A fresh ``SGDClassifier(loss='hinge', penalty='l2')`` is fitted on the
    training part of every fold and scored on the held-out part; the mean of
    those per-fold scores is printed.

    Args:
        k_fold: Iterable of ``(train_indices, test_indices)`` pairs.
        train_X: Feature data, indexed with the index collections taken from
            ``k_fold`` (presumably a NumPy array -- confirm with the caller).
        train_Y: Labels, indexed the same way as ``train_X``.

    Returns:
        The classifier fitted on the LAST fold only (not on the full data).

    Raises:
        ValueError: If ``k_fold`` yields no splits.  ``SGDClassifier.fit``
            can raise it as well, e.g. when the labels contain NaN/infinity.
    """
    # Materialise the folds first: an empty split would otherwise surface as
    # an unbound ``sgd`` at the final ``return``.
    folds = list(k_fold)
    if not folds:
        raise ValueError("k_fold yielded no (train, test) splits")

    from sklearn.linear_model import SGDClassifier

    scores_sgd = []
    sgd = None

    for train_indices, test_indices in folds:
        train_X_cv = train_X[train_indices]
        train_Y_cv = train_Y[train_indices]

        test_X_cv = train_X[test_indices]
        test_Y_cv = train_Y[test_indices]

        # Hinge loss + L2 penalty is the linear-SVM configuration.
        sgd = SGDClassifier(loss='hinge', penalty='l2')
        sgd.fit(train_X_cv, train_Y_cv)
        scores_sgd.append(sgd.score(test_X_cv, test_Y_cv))

    print("The mean accuracy of Stochastic Gradient Descent Classifier on CV data is:", np.mean(scores_sgd))

    return sgd

这是 k_folds 数据的样子:

# Show the number of training and test indices in every fold.
for train_idx, test_idx in k_fold:
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print("%d %d" % (len(train_idx), len(test_idx)))

我甚至检查了 train_X 和 train_Y 中是否存在 NaN:

>>> numpy.isnan( train_X ).any()


ValueError                                Traceback (most recent call last)
<ipython-input-22-a2f71582edd0> in <module>()
      1 # Running SGD. and RF
----> 3 sgd=SGD(k_fold,train_X,train_Y)
      4 #rf=RF(k_fold,train_X,train_Y)

<ipython-input-12-7e3b6395f3d6> in SGD(k_fold, train_X, train_Y)
     63             sgd=SGDClassifier(loss='hinge',penalty='l2')
---> 64             scores_sgd.append(sgd.fit(train_X_cv,train_Y_cv).score(test_X_cv,test_Y_cv))
     66         print("The mean accuracy of Stochastic Gradient Descent Classifier on CV data is:", np.mean(scores_sgd))

/Users/mtripathi/anaconda/lib/python2.7/site-packages/sklearn/linear_model/stochastic_gradient.pyc in fit(self, X, y, coef_init, intercept_init, class_weight, sample_weight)
    562                          loss=self.loss, learning_rate=self.learning_rate,
    563                          coef_init=coef_init, intercept_init=intercept_init,
--> 564                          sample_weight=sample_weight)

/Users/mtripathi/anaconda/lib/python2.7/site-packages/sklearn/linear_model/stochastic_gradient.pyc in _fit(self, X, y, alpha, C, loss, learning_rate, coef_init, intercept_init, sample_weight)
    401             self.classes_ = None
--> 403         X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C")
    404         n_samples, n_features = X.shape

/Users/mtripathi/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric)
    448     else:
    449         y = column_or_1d(y, warn=True)
--> 450         _assert_all_finite(y)
    451     if y_numeric and y.dtype.kind == 'O':
    452         y = y.astype(np.float64)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-22-a2f71582edd0> in <module>()
      1 # Running SGD. and RF
----> 3 sgd=SGD(k_fold,train_X,train_Y)
      4 #rf=RF(k_fold,train_X,train_Y)

<ipython-input-12-7e3b6395f3d6> in SGD(k_fold, train_X, train_Y)
     63             sgd=SGDClassifier(loss='hinge',penalty='l2')
---> 64             scores_sgd.append(sgd.fit(train_X_cv,train_Y_cv).score(test_X_cv,test_Y_cv))
     66         print("The mean accuracy of Stochastic Gradient Descent Classifier on CV data is:", np.mean(scores_sgd))

/Users//anaconda/lib/python2.7/site-packages/sklearn/linear_model/stochastic_gradient.pyc in fit(self, X, y, coef_init, intercept_init, class_weight, sample_weight)
    562                          loss=self.loss, learning_rate=self.learning_rate,
    563                          coef_init=coef_init, intercept_init=intercept_init,
--> 564                          sample_weight=sample_weight)

/Users//anaconda/lib/python2.7/site-packages/sklearn/linear_model/stochastic_gradient.pyc in _fit(self, X, y, alpha, C, loss, learning_rate, coef_init, intercept_init, sample_weight)
    401             self.classes_ = None
--> 403         X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C")
    404         n_samples, n_features = X.shape

/Users//anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric)
    448     else:
    449         y = column_or_1d(y, warn=True)
--> 450         _assert_all_finite(y)
    451     if y_numeric and y.dtype.kind == 'O':
    452         y = y.astype(np.float64)

/Users//anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in _assert_all_finite(X)
     50             and not np.isfinite(X).all()):
     51         raise ValueError("Input contains NaN, infinity"
---> 52                          " or a value too large for %r." % X.dtype)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

/Users/mtripathi/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in _assert_all_finite(X)
     50             and not np.isfinite(X).all()):
     51         raise ValueError("Input contains NaN, infinity"
---> 52                          " or a value too large for %r." % X.dtype)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

请帮忙。我也尝试了 random forest,但它会导致内核被杀掉,我不知道为什么。

1 个答案:

答案 0 :(得分:0)


np.nan_to_num( ... ).astype( np.float32 ) # may help to prototype the methods

Replace nan with zero and inf with finite numbers.

Returns an array or scalar replacing Not a Number (NaN) with zero,
(positive) infinity with a very large number and negative infinity
with a very small (or negative) number.

x : array_like
    Input data.

out : ndarray, float
    Array with the same shape as `x` and dtype of the element in `x`  with
    the greatest precision. NaN is replaced by zero, and infinity
    (-infinity) is replaced by the largest (smallest or most negative)
    floating point value that fits in the output dtype. All finite numbers
    are upcast to the output dtype (default float64).

See Also
isinf    : Shows which elements are positive or negative infinity.
isneginf : Shows which elements are negative infinity.
isposinf : Shows which elements are positive infinity.
isnan    : Shows which elements are Not a Number (NaN).
isfinite : Shows which elements are finite (not NaN, not infinity)

Numpy uses the IEEE Standard for Binary Floating-Point for Arithmetic
(IEEE 754). This means that Not a Number is not equivalent to infinity.

>>> np.set_printoptions(precision=8)
>>> x = np.array([np.inf, -np.inf, np.nan, -128, 128])
>>> np.nan_to_num(x)
array([  1.79769313e+308,  -1.79769313e+308,   0.00000000e+000,
        -1.28000000e+002,   1.28000000e+002])


一旦操作系统(O/S)开始杀掉 RandomForest / SVM 的内核进程:

1)修改可用(O / S相关) RAM大小/虚拟内存管理选项并尽可能升级路径。

2)缩减数据集的内存表示,将 np.float64 向下转换为 np.float32。

3)避免在 RAM 中产生重复副本;Python 内存管理器往往不会把已分配的内存归还给操作系统重用,因此可以考虑改用 numpy 的非内存驻留矩阵表示(内存映射文件)。

4)最后,在接近 RAM 上限时,可能会受益于 RandomForest 的增量构建( warm_start = True )。

如果您的localhost O / S不允许所需的大小,请将您的处理移至不同的O / S平台/云托管处理,其中RAM大小和O / S允许处理更大的RAM内对象。


def getExtMemoryUsed():
    """Print and return the extended memory counters of the current process.

    Debugging aid (ref. 2 GB memory crashes in Python on Windows XP and the
    /3GB [BOOT.ini] options).  Every counter except the page-fault count is
    printed scaled by 2**20 and labelled "MB"; the raw page-fault count is
    printed last.

    Returns:
        The object returned by
        ``psutil.Process(os.getpid()).get_ext_memory_info()``.
    """
    import os
    import psutil

    # NOTE(review): get_ext_memory_info() looks like the legacy psutil name;
    # newer releases appear to expose memory_info()/memory_full_info()
    # instead -- confirm against the installed psutil version.
    aMemoryINFO = psutil.Process(os.getpid()).get_ext_memory_info()

    # (field name, unit), in the positional order of the returned record.
    anItemLIST = [
        ("num_page_faults",    "x"),
        ("peak_wset",          "MB"),
        ("wset",               "MB"),
        ("peak_paged_pool",    "MB"),
        ("paged_pool",         "MB"),
        ("peak_nonpaged_pool", "MB"),
        ("nonpaged_pool",      "MB"),
        ("pagefile",           "MB"),
        ("peak_pagefile",      "MB"),
        ("private",            "MB"),
    ]

    # Display order; the trailing 0 (page faults) is printed separately below
    # because it is a plain count rather than a byte size.
    aListOfShowIDs = [8, 7, 9, 1, 2, 3, 4, 5, 6, 0]

    for anItemID in aListOfShowIDs[:-1]:
        # Each line is one pre-formatted string, so the output is the same
        # on Python 2 and Python 3.
        print("{0: > 16.3f} {2:_>20s} {1: >3s} ".format(aMemoryINFO[anItemID] / float(2**20), anItemLIST[anItemID][1], anItemLIST[anItemID][0]))
    print("                 {2:_>20s} {0: >11d} x\n".format(aMemoryINFO[0], "", anItemLIST[0][0]))

    return aMemoryINFO


# Sample session:
#
#     |>>> getExtMemoryUsed()
#         1534.012 _______peak_pagefile  MB
#         1195.777 ____________pagefile  MB
#         1195.777 _____________private  MB
#         1119.109 ___________peak_wset  MB
#          662.457 ________________wset  MB
#            0.321 _____peak_paged_pool  MB
#            0.321 __________paged_pool  MB
#            0.084 __peak_nonpaged_pool  MB
#            0.083 _______nonpaged_pool  MB
#                  _____num_page_faults     3727767 x
#
# pextmem(num_page_faults=3727767, peak_wset=1173471232, wset=694636544, peak_paged_pool=336316, paged_pool=336172, peak_nonpaged_pool=87968, nonpaged_pool=87480, pagefile=1253863424, peak_pagefile=1608527872, private=1253863424)

