如何将由多个分布组成的列表传递给 sklearn 的 RandomizedSearchCV

时间:2016-07-29 21:32:30

标签: python scipy scikit-learn shogun

我在 Python 中有一个自定义的估计器对象(mkl_regressor)。这种对象的学习参数之一是一个由浮点数组成的 numpy.array。通常,sklearn 的估计器对象是通过单值参数来调优的,例如 SVM 的 C。因此,RandomizedSearchCV 搜索对象接受一个分布或一个取值列表,并从给定的分布(在我的示例中为 scipy.stats.expon)中为所需参数采样取值。我尝试传入一个由分布组成的列表,但没有成功,因为 RandomizedSearchCV 不会对列表中的各个分布进行采样。以下是我尝试过的代码:

from modshogun import *
import Gnuplot, Gnuplot.funcutils
from numpy import *
from sklearn.metrics import r2_score

class mkl_regressor(object):
    """Multiple-kernel-learning regressor built on Shogun's ``MKLRegression``.

    The combined kernel holds one ``GaussianKernel`` per entry of ``widths``
    plus a single ``PolyKernel`` of degree ``degree``.  The class exposes the
    ``fit`` / ``predict`` / ``score`` / ``get_params`` / ``set_params``
    methods that scikit-learn search objects call on an estimator.

    Parameters
    ----------
    widths : sequence of float, optional
        Widths of the Gaussian sub-kernels.  Defaults to
        ``[0.01, 0.1, 1.0, 10.0, 50.0, 100.0]``.
    kernel_weights : sequence of float, optional
        Defaults to ``[0.01, 0.1, 1.0]``.  Not read by ``fit``; it is
        overwritten there with the learned sub-kernel weights.
    svm_c : float
        Passed as both arguments of ``MKLRegression.set_C``.
    mkl_c : float
        Passed to ``MKLRegression.set_C_mkl``.
    svm_norm : int
        Passed to ``MKLRegression.set_mkl_block_norm``.
    mkl_norm : int
        Passed to ``MKLRegression.set_mkl_norm``.
    degree : int
        Degree of the polynomial sub-kernel.
    """

    # Constructor argument names.  get_params() reports exactly these so the
    # returned dict can always be fed back into __init__ (which is what
    # scikit-learn's clone() does), even after fit() has attached Shogun
    # objects such as ``mkl`` or ``feats_train`` to the instance.
    _param_names = ('widths', 'kernel_weights', 'svm_c', 'mkl_c',
                    'svm_norm', 'mkl_norm', 'degree')

    def __init__(self, widths=None, kernel_weights=None, svm_c=0.01,
                 mkl_c=1.0, svm_norm=1, mkl_norm=1, degree=2):
        self.svm_c = svm_c
        self.mkl_c = mkl_c
        self.svm_norm = svm_norm
        self.mkl_norm = mkl_norm
        self.degree = degree
        # Fresh lists per instance: list literals in the signature would be
        # shared by every instance created with the defaults.  Explicitly
        # passed values are stored untouched.
        if widths is None:
            widths = [0.01, 0.1, 1.0, 10.0, 50.0, 100.0]
        if kernel_weights is None:
            kernel_weights = [0.01, 0.1, 1.0]
        self.widths = widths
        self.kernel_weights = kernel_weights

    def fit(self, X, y, **params):
        """Train the MKL machine on ``X`` / ``y`` and return ``self``.

        Any extra keyword arguments are set as attributes on the estimator
        before training.  ``X`` is transposed before being wrapped in
        ``RealFeatures`` and ``y`` is flattened to ``len(y)`` entries.
        """
        for parameter, value in params.items():
            setattr(self, parameter, value)

        self.feats_train = RealFeatures(X.T)
        labels_train = RegressionLabels(y.reshape((len(y), )))

        self._kernels_ = CombinedKernel()
        for width in self.widths:
            gaussian = GaussianKernel()
            gaussian.set_width(width)
            gaussian.init(self.feats_train, self.feats_train)
            self._kernels_.append_kernel(gaussian)

        # NOTE(review): the first argument (10) is presumably Shogun's kernel
        # cache size -- confirm against the PolyKernel API.
        self._kernels_.append_kernel(PolyKernel(10, self.degree))
        self._kernels_.init(self.feats_train, self.feats_train)

        self.mkl = MKLRegression(SVRLight())
        self.mkl.set_C(self.svm_c, self.svm_c)
        self.mkl.set_C_mkl(self.mkl_c)
        self.mkl.set_mkl_norm(self.mkl_norm)
        self.mkl.set_mkl_block_norm(self.svm_norm)

        self.mkl.set_kernel(self._kernels_)
        self.mkl.set_labels(labels_train)
        self.mkl.train()
        # Replace the constructor value with the weights learned by training.
        self.kernel_weights = self._kernels_.get_subkernel_weights()
        return self

    def predict(self, X):
        """Return the regression outputs for ``X``; requires a prior ``fit``."""
        self.feats_test = RealFeatures(X.T)
        # Re-initialise the combined kernel between training and test data.
        self._kernels_.init(self.feats_train, self.feats_test)
        self.mkl.set_kernel(self._kernels_)
        return self.mkl.apply_regression().get_labels()

    def set_params(self, **params):
        """Set each keyword argument as an attribute and return ``self``."""
        for parameter, value in params.items():
            setattr(self, parameter, value)

        return self

    def get_params(self, deep=False):
        """Return the constructor parameters as a ``{name: value}`` dict.

        ``deep`` is accepted for interface compatibility and is not used.
        """
        return {name: getattr(self, name) for name in self._param_names}

    def score(self, X_t, y_t):
        """Return the R^2 of the predictions for ``X_t`` against ``y_t``."""
        predicted = self.predict(X_t)
        # r2_score expects (y_true, y_pred); R^2 is not symmetric, so the
        # ground truth must come first.
        return r2_score(y_t, predicted)

if __name__ == "__main__":
    # Demo: randomized hyper-parameter search over mkl_regressor on a toy
    # data set, then scoring of the best model on a small held-out set.

    try:
        # Location of RandomizedSearchCV in current scikit-learn releases.
        from sklearn.model_selection import RandomizedSearchCV as RS
    except ImportError:
        # Older scikit-learn releases only ship the grid_search module.
        from sklearn.grid_search import RandomizedSearchCV as RS
    from scipy.stats import randint as sp_randint
    from scipy.stats import expon

    # Training set: 6 samples with 3 features, targets as a column vector.
    labels = array([2.0,0.0,2.0,1.0,3.0,2.0])
    labels = labels.reshape((len(labels), 1))
    data = array([[1.0,2.0,3.0],[1.0,2.0,9.0],[1.0,2.0,3.0],[1.0,2.0,0.0],[0.0,2.0,3.0],[1.0,2.0,3.0]])
    # Held-out set: 3 samples.
    labels_t = array([1.,3.,4])
    labels_t = labels_t.reshape((len(labels_t), 1))
    data_t = array([[20.0,30.0,40.0],[10.0,20.0,30.0],[10.0,20.0,40.0]])
    k = 3  # number of cross-validation folds

    param_grid = [ {'svm_c': expon(scale=100, loc=5),
                'mkl_c': expon(scale=100, loc=5),
                'degree': sp_randint(0, 32),
                # Working alternative -- a list of concrete candidate arrays:
                #'widths': [array([4.0,6.0,8.9,3.0]), array([4.0,6.0,8.9,3.0,2.0, 3.0, 4.0]), array( [100.0, 200.0, 300.0, 400.0])
                # Attempt under discussion: a list is treated as a set of
                # discrete candidates, so the inner [expon, expon] reaches
                # the estimator unsampled and GaussianKernel.set_width
                # rejects it with a TypeError.
                'widths': [[expon, expon]]
              }]

    mkl = mkl_regressor()
    # Optional extras that were left disabled: scoring="r2", verbose=True
    rs = RS(mkl, param_distributions = param_grid[0], n_iter = 10, n_jobs = 24, cv = k)
    rs.fit(data, labels)
    preds = rs.predict(data_t)

    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("R^2:  %s" % (rs.score(data_t, labels_t),))
    print("Parameters:  %s" % (rs.best_params_,))

如果把 numpy 数组作为参数字典中 'widths' 列表的元素传入,上面的代码可以正常工作。但是,当我尝试传入由分布组成的列表时,RandomizedSearchCV 对象并没有按预期工作:

/home/ignacio/distributionalSemanticStabilityThesis/mkl_test.py in fit(self=<__main__.mkl_regressor instance>, X=array([[ 1.,  2.,  3.],
       [ 1.,  2.,  0.],
       [ 0.,  2.,  3.],
       [ 1.,  2.,  3.]]), y=array([[ 2.],
       [ 1.],
       [ 3.],
       [ 2.]]), **params={})
     24         self.feats_train = RealFeatures(X.T)
     25         labels_train = RegressionLabels(y.reshape((len(y), )))
     26         self._kernels_  = CombinedKernel()
     27         for width in self.widths:
     28             kernel = GaussianKernel()
---> 29             kernel.set_width(width)
        kernel.set_width = <built-in method set_width of GaussianKernel object>
        width = <scipy.stats._continuous_distns.expon_gen object>
     30             kernel.init(self.feats_train,self.feats_train)
     31             self._kernels_.append_kernel(kernel)
     32             del kernel
     33 

TypeError: in method 'GaussianKernel_set_width', argument 2 of type 'float64_t'

我不想让估计器自己去对每个分布生成器进行采样,因为那样的话,RandomizedSearchCV 将无法掌控实际使用的参数值。

有什么建议吗?谢谢。

2 个答案:

答案 0 :(得分:1)

RandomizedSearchCV 既可以接受一个待尝试的参数值列表,也可以接受一个带有 rvs 方法的分布对象来进行采样。如果您传入的是列表,它会认为这是一组供抽取的离散参数值。它不支持为单个参数传入由多个分布组成的列表。如果现有的分布不能满足您的需求,请自定义一个分布。

如果需要一个返回数组的分布,只需创建一个带有 rvs() 方法的类,让该方法返回随机样本,然后传入这个类的实例,而不是传入由单变量分布组成的列表。

答案 1 :(得分:1)

@bpachev 建议的解决方案对我有效。分布类如下:

class expon_vector(stats.rv_continuous):
    """Distribution whose samples are variable-length arrays of exponential draws.

    Each call to ``rvs`` first draws a length with ``randint.rvs`` using
    ``low=min_size`` and ``high=max_size`` (scipy's ``randint`` excludes
    ``high``), then returns that many values drawn with ``expon.rvs`` using
    ``loc`` and ``scale``.

    Parameters
    ----------
    loc, scale : float
        Forwarded to ``expon.rvs``.
    min_size, max_size : int
        Bounds forwarded to ``randint.rvs`` as ``low`` and ``high``.
    """

    def __init__(self, loc = 1.0, scale = 50.0, min_size=2, max_size = 10):
        # NOTE(review): rv_continuous.__init__ is not called here, so only
        # the overridden rvs() is meant to be used on instances.
        self.loc = loc
        self.scale = scale
        self.min_size = min_size
        self.max_size = max_size
        self.size = max_size - min_size # Placeholder until the first rvs() call

    def rvs(self, random_state=None):
        """Return one random-length array of exponential samples.

        ``random_state`` is optional and forwarded to both scipy draws, so a
        caller that samples with ``rvs(random_state=...)`` (as newer
        scikit-learn parameter samplers do) gets reproducible output.
        Calling ``rvs()`` with no argument still works.
        """
        drawn = randint.rvs(low = self.min_size, high = self.max_size,
                            size = 1, random_state = random_state)
        # Keep the length of the latest sample as a plain int.
        self.size = int(drawn[0])
        return expon.rvs(loc = self.loc, scale = self.scale,
                         size = self.size, random_state = random_state)

我为自定义估计器使用的参数字典如下:

# Search space for the custom estimator.  Every value is an object with an
# rvs() method; 'widths' uses expon_vector so that each sample is a whole
# array of kernel widths rather than a single number.
param_grid = [
    dict(
        svm_c=expon(scale=100, loc=5),
        mkl_c=expon(scale=100, loc=5),
        degree=sp_randint(0, 24),
        widths=expon_vector(loc=0.1, scale=100.0, min_size=2, max_size=10),
    )
]