Error: No module named multiarray when running SVM code on top of Hadoop

Time: 2013-05-12 15:09:12

Tags: python numpy svm importerror

I am trying to run an SVM algorithm in parallel using Hadoop.

I found a parallel SVM that has already been implemented on Hadoop in Python.

Below is the code I found. It is based on the Pegasos algorithm, with the weight-update step adapted to Hadoop.

    '''
    Created on Feb 27, 2011
    MapReduce version of Pegasos SVM
    Using mrjob to automate job flow
    @author: Peter
    '''
    from mrjob.job import MRJob

    import pickle
    from numpy import *

    class MRsvm(MRJob):
        DEFAULT_INPUT_PROTOCOL = 'json_value'

        def __init__(self, *args, **kwargs):
            super(MRsvm, self).__init__(*args, **kwargs)
            # 'rb': pickled NumPy arrays are binary data
            self.data = pickle.load(open('/usr/local/hadoop/Python_SVM/machinelearninginaction/Ch15/svmDat26', 'rb'))
            self.w = 0
            self.eta = 0.69
            self.dataList = []
            self.k = self.options.batchsize
            self.numMappers = 1
            self.t = 1  #iteration number

        def configure_options(self):
            super(MRsvm, self).configure_options()
            self.add_passthrough_option(
                '--iterations', dest='iterations', default=2, type='int',
                help='T: number of iterations to run')
            self.add_passthrough_option(
                '--batchsize', dest='batchsize', default=100, type='int',
                help='k: number of data points in a batch')

        def map(self, mapperId, inVals): #needs exactly 2 arguments
            #input: nodeId, ('w', w-vector) OR nodeId, ('x', int)
            if False: yield  #trick: makes map a generator even though it emits nothing
            if inVals[0]=='w':                  #accumulate W-vector
                self.w = inVals[1]
            elif inVals[0]=='x':
                self.dataList.append(inVals[1]) #accumulate data points to calc
            elif inVals[0]=='t': self.t = inVals[1]
            else: self.eta = inVals #this is for debug, eta not used in map

        def map_fin(self):
            labels = self.data[:,-1]; X = self.data[:,0:-1] #reshape data into X and Y
            if self.w == 0: self.w = [0.001]*shape(X)[1] #init w on first iteration
            for index in self.dataList:
                p = mat(self.w)*X[index,:].T #calc p=w*dataSet[key].T
                if labels[index]*p < 1.0:    #margin violation
                    yield (1, ['u', index])  #make sure everything has the same key
            yield (1, ['w', self.w])         #so it ends up at the same reducer
            yield (1, ['t', self.t])

        def reduce(self, _, packedVals):
            for valArr in packedVals: #get values from streamed inputs
                if valArr[0]=='u':  self.dataList.append(valArr[1])
                elif valArr[0]=='w': self.w = valArr[1]
                elif valArr[0]=='t':  self.t = valArr[1]
            labels = self.data[:,-1]; X = self.data[:,0:-1]
            wMat = mat(self.w);   wDelta = mat(zeros(len(self.w)))
            for index in self.dataList:
                wDelta += float(labels[index])*X[index,:] #wDelta += label*dataSet
            eta = 1.0/(2.0*self.t)       #calc new: eta
            #calc new: w = (1.0 - 1/t)*w + (eta/k)*wDelta
            wMat = (1.0 - 1.0/self.t)*wMat + (eta/self.k)*wDelta

    if __name__ == '__main__':
        MRsvm.run()  #entry point; this is the call at line 78 of the traceback below
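For reference, the `reduce()` step above computes the mini-batch Pegasos update, w = (1 - 1/t)*w + (eta/k) * sum(y_i * x_i), over the margin-violating points that the mappers emit as `['u', index]` records. A minimal serial sketch of the same update (my own illustrative function name, NumPy only), which may make the MapReduce plumbing easier to follow:

    import numpy as np

    def pegasos_batch_update(w, X, y, batch_idx, t, k):
        '''One mini-batch Pegasos step, mirroring map_fin() plus reduce() above.
        w: current weights; X: data matrix; y: labels (+1/-1);
        batch_idx: indices of this batch; t: iteration number; k: batch size.'''
        eta = 1.0 / (2.0 * t)                   # learning-rate schedule used in the post
        w_delta = np.zeros_like(w)
        for i in batch_idx:
            if y[i] * np.dot(w, X[i]) < 1.0:    # margin violated: mapper would yield ['u', i]
                w_delta += y[i] * X[i]          # same accumulation as the reducer's wDelta
        return (1.0 - 1.0 / t) * w + (eta / k) * w_delta

Repeating this update on fresh random batches for `--iterations` rounds is what the mrjob version spreads across the mappers and the single reducer.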

When I run the code, it outputs the following.

`File "mrSVM.py", line 78, in <module>
    MRsvm.run()
  File "/usr/lib/python2.6/site-packages/mrjob/job.py", line 482, in run
    mr_job = cls(args=_READ_ARGS_FROM_SYS_ARGV)
  File "mrSVM.py", line 17, in __init__
    self.data = pickle.load(open('/usr/local/hadoop/Python_SVM/machinelearninginaction/Ch15/svmDat26'))
  File "/usr/lib64/python2.6/pickle.py", line 1370, in load
    return Unpickler(file).load()
  File "/usr/lib64/python2.6/pickle.py", line 858, in load
    dispatch[key](self)
  File "/usr/lib64/python2.6/pickle.py", line 1090, in load_global
    klass = self.find_class(module, name)
  File "/usr/lib64/python2.6/pickle.py", line 1124, in find_class
    __import__(module)
ImportError: No module named multiarray`

At first I suspected NumPy, since multiarray is a math module related to it, so I installed the package on all of the slave nodes, but that did not solve the problem.
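As far as I understand, `multiarray` is a C-extension submodule of NumPy itself (`numpy.core.multiarray`), and `pickle` has to import it to rebuild the pickled arrays, so the error usually means the interpreter that performs the unpickling cannot import NumPy at all (or finds a broken/mismatched build), rather than a problem in the SVM code itself. A quick diagnostic sketch, to be run on each node with the same interpreter that executes the job (the pickle path is the one from the code above):

    import sys
    print sys.executable            # which interpreter is actually running the job?

    import numpy                    # if this fails, NumPy is missing for this interpreter
    import numpy.core.multiarray    # the exact module the unpickler tries to import
    print numpy.__version__, numpy.__file__

    import pickle
    # binary mode, since pickled NumPy arrays are binary data
    data = pickle.load(open('/usr/local/hadoop/Python_SVM/machinelearninginaction/Ch15/svmDat26', 'rb'))
    print type(data)

If `import numpy.core.multiarray` fails only on the worker nodes, the job is running under a different Python than the one NumPy was installed for.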

Is there a syntax problem with the code?

Thanks

0 Answers:

There are no answers