I am trying to run an SVM algorithm in parallel using Hadoop. I found a parallel SVM that has already been implemented for Hadoop in Python. Below is the code I found. It is based on the Pegasos algorithm, with the weight-update step adapted to MapReduce:
```python
'''
Created on Feb 27, 2011
MapReduce version of Pegasos SVM
Using mrjob to automate job flow
@author: Peter
'''
from mrjob.job import MRJob
import pickle
from numpy import *

class MRsvm(MRJob):
    DEFAULT_INPUT_PROTOCOL = 'json_value'

    def __init__(self, *args, **kwargs):
        super(MRsvm, self).__init__(*args, **kwargs)
        # load the full dataset; this runs on every machine that constructs the job
        self.data = pickle.load(open('/usr/local/hadoop/Python_SVM/machinelearninginaction/Ch15/svmDat26'))
        self.w = 0
        self.eta = 0.69
        self.dataList = []
        self.k = self.options.batchsize
        self.numMappers = 1
        self.t = 1  # iteration number

    def configure_options(self):
        super(MRsvm, self).configure_options()
        self.add_passthrough_option(
            '--iterations', dest='iterations', default=2, type='int',
            help='T: number of iterations to run')
        self.add_passthrough_option(
            '--batchsize', dest='batchsize', default=100, type='int',
            help='k: number of data points in a batch')

    def map(self, mapperId, inVals):  # needs exactly 2 arguments
        # input: nodeId, ('w', w-vector) OR nodeId, ('x', int)
        if False: yield  # makes map a generator even though it yields nothing itself
        if inVals[0] == 'w':  # accumulate the w vector
            self.w = inVals[1]
        elif inVals[0] == 'x':
            self.dataList.append(inVals[1])  # accumulate data-point indices to evaluate
        elif inVals[0] == 't':
            self.t = inVals[1]
        else:
            self.eta = inVals  # this is for debug; eta is not used in map

    def map_fin(self):
        labels = self.data[:, -1]; X = self.data[:, 0:-1]  # split data into X and labels
        if self.w == 0: self.w = [0.001] * shape(X)[1]  # init w on the first iteration
        for index in self.dataList:
            p = mat(self.w) * X[index, :].T  # calc p = w * x_index^T
            if labels[index] * p < 1.0:  # margin violated: emit this index
                yield (1, ['u', index])  # give everything the same key
        yield (1, ['w', self.w])  # so it all ends up at the same reducer
        yield (1, ['t', self.t])

    def reduce(self, _, packedVals):
        for valArr in packedVals:  # unpack the streamed inputs
            if valArr[0] == 'u': self.dataList.append(valArr[1])
            elif valArr[0] == 'w': self.w = valArr[1]
            elif valArr[0] == 't': self.t = valArr[1]
        labels = self.data[:, -1]; X = self.data[:, 0:-1]
        wMat = mat(self.w); wDelta = mat(zeros(len(self.w)))
        for index in self.dataList:
            wDelta += float(labels[index]) * X[index, :]  # wDelta += label * x_index
        eta = 1.0 / (2.0 * self.t)  # calc new eta
        # calc new w: w = (1.0 - 1/t)*w + (eta/k)*wDelta
        wMat = (1.0 - 1.0 / self.t) * wMat + (eta / self.k) * wDelta
        for mapperNum in range(1, self.numMappers + 1):
            yield (mapperNum, ['w', wMat.tolist()[0]])  # emit the updated w
            if self.t < self.options.iterations:
                yield (mapperNum, ['t', self.t + 1])  # increment t
                for j in range(self.k / self.numMappers):  # emit random indices for the next batch
                    yield (mapperNum, ['x', random.randint(shape(self.data)[0])])

    def steps(self):
        return ([self.mr(mapper=self.map, reducer=self.reduce,
                         mapper_final=self.map_fin)] * self.options.iterations)

if __name__ == '__main__':
    MRsvm.run()
```
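If I understand the algorithm correctly, each map/reduce pass performs one Pegasos batch update. For reference, here is a minimal serial sketch of that same update (the function and variable names are my own, for illustration only):

```python
import numpy as np

def pegasos_batch_step(w, X, labels, batch_idx, t, k):
    """One Pegasos batch update, mirroring map_fin + reduce above."""
    wDelta = np.zeros_like(w)
    for i in batch_idx:
        # map_fin marks index i as a margin violator when y_i * (w . x_i) < 1
        if labels[i] * np.dot(w, X[i]) < 1.0:
            wDelta += labels[i] * X[i]          # reduce accumulates y_i * x_i
    eta = 1.0 / (2.0 * t)                       # same step size as the reducer
    # w <- (1 - 1/t) * w + (eta / k) * wDelta
    return (1.0 - 1.0 / t) * w + (eta / k) * wDelta
```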
When I run the code, I get the following output:
`File "mrSVM.py", line 78, in <module>
MRsvm.run()
File "/usr/lib/python2.6/site-packages/mrjob/job.py", line 482, in run
mr_job = cls(args=_READ_ARGS_FROM_SYS_ARGV)
File "mrSVM.py", line 17, in __init__
self.data = pickle.load(open('/usr/local/hadoop/Python_SVM/machinelearninginaction/Ch15/svmDat26'))
File "/usr/lib64/python2.6/pickle.py", line 1370, in load
return Unpickler(file).load()
File "/usr/lib64/python2.6/pickle.py", line 858, in load
dispatch[key](self)
File "/usr/lib64/python2.6/pickle.py", line 1090, in load_global
klass = self.find_class(module, name)
File "/usr/lib64/python2.6/pickle.py", line 1124, in find_class
__import__(module)
ImportError: No module named multiarray`
At first I thought the cause was that NumPy, the math package that provides the multiarray module, was missing on the cluster, so I installed the package on all of the slave nodes, but that did not solve the problem.
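As far as I can tell, the failing part of the job boils down to this standalone load (a minimal sketch; I am assuming svmDat26 was pickled from NumPy arrays, so unpickling it has to import NumPy's multiarray module):

```python
# Standalone version of the load in __init__; run outside Hadoop on a node.
# Unpickling a NumPy array imports numpy's multiarray module under the hood,
# so this should reproduce the ImportError wherever numpy is not importable.
import numpy   # fails immediately if numpy is missing on this node
import pickle

f = open('/usr/local/hadoop/Python_SVM/machinelearninginaction/Ch15/svmDat26', 'rb')
data = pickle.load(f)
f.close()
print(type(data))
```

If this snippet fails the same way on a node, then that node's Python cannot import the module the pickle references.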
Is there a syntax problem in this code?
Thanks.