我在python中实现了一个smo算法。因为我只是作为一种练习,所以我没有使用一些科学计算库,如numpy和scipy。我只是希望它的工作正确。但是当我在diabetes上测试我的代码时,它会持续运行一周!我已多次检查我的代码,我也发现了一些错误。但在我纠正这些错误后,代码仍然运行得太慢。我不知道是否有一些我没有检查过的错误,或者smo本身就是如此慢。
那么是否存在一些可能导致代码运行缓慢的常见错误?我编写了我的程序,引用了smo paper的伪代码 非常感谢。
以下是我的代码。
#encoding=utf8
import math
import random
class SVM(object):
def __init__(self, dataset, target, C=0.001, tolerance=0.001):
self.dataset=dataset
self.target=target
self.C=C
self.tolerance=tolerance
self.alpha=[0.0 for i in range(len(dataset))]
self.E={}
self.b=0.0
self.w=[0.0 for i in range(len(dataset[0]))]
def train(self):
numChanged=0
exampleAll=1
trainset_size=len(self.dataset)
iter=0
while numChanged > 0 or exampleAll:
numChanged=0
if exampleAll:
for i in range(trainset_size):
numChanged+=self.examineExample(i)
iter+=1
else:
for i in range(trainset_size):
if self.alpha[i] > 0 and self.alpha[i] < self.C:
numChanged+=self.examineExample(i)
iter+=1
if exampleAll:
exampleAll=0
elif numChanged == 0:
exampleAll=1
print "iter", iter
print "alpha", "\t".join([str(i) for i in self.alpha])
print "target", "\t".join(self.target)
for j in range(len(self.trainset[0])):
for i in range(trainset_size):
self.w[j] +=self.alpha[i]*int(self.target[i])*float(self.dataset[i][j])
def examineExample(self, i2):
print "in examineExample", i2
print "alpha", "\t".join([str(i) for i in self.alpha])
alpha2=self.alpha[i2]
y2=int(self.target[i2])
e2=self.calculateE(i2)
r2=e2*y2
print "r2", r2
if r2 < -self.tolerance and self.alpha[i2] < self.C or r2 > self.tolerance and self.alpha[i2] > 0: #i2违反了kkt条件
i1=self.select_i1(i2,e2)
if self.takeStep(i1, i2):
return 1
else:
all_sample_index=[i for i in range(len(self.dataset)) ]
random.shuffle(all_sample_index)
for k in range(len(all_sample_index)):
i1=all_sample_index[k]
if self.alpha[i1] > 0 and self.alpha[i1] < self.C:
if self.takeStep(i1, i2):
return 1
random.shuffle(all_sample_index)
for k in range(len(all_sample_index)):
i1=all_sample_index[k]
if self.takeStep(i1,i2):
return 1
return 0
def takeStep(self, i1, i2):
print "in takeStep", i1, i2
if i1==i2:
return 0
alpha1=self.alpha[i1]
y1=int(self.target[i1])
e1=self.calculateE(i1)
alpha2=self.alpha[i2]
y2=int(self.target[i2])
e2=self.calculateE(i2)
s=y1*y2
if y1 != y2:
L=max(0, alpha2-alpha1)
H=min(self.C, self.C+alpha2-alpha1)
if y1== y2:
L=max(0, alpha2+alpha1-self.C)
H=min(self.C, alpha2+alpha1)
if L==H:
return 0
k11=self.kernel(i1, i1)
k12=self.kernel(i1, i2)
k22=self.kernel(i2, i2)
eta=k11+k22-2*k12
if eta > 0:
self.alpha[i2]=alpha2+y2*(e1-e2)/eta
if self.alpha[i2] < L:
self.alpha[i2]=L
if self.alpha[i2] >H:
self.alpha[i2]=H
print "abs", abs(self.alpha[i2] - alpha2)
if abs(self.alpha[i2] - alpha2) < 0.00001
return 0
self.alpha[i1]=alpha1+s*(alpha2-self.alpha[i2])
b1=self.b-e1-y1*(self.alpha[i1]-alpha1)*self.kernel(i1,i1)-y2*(self.alpha[i2]-alpha2)*self.kernel(i1,i2)
b2=self.b-e2-y1*(self.alpha[i1]-alpha1)*self.kernel(i1,i2)-y2*(self.alpha[i2]-alpha2)*self.kernel(i2,i2)
print "two old alpha", alpha1, alpha2
print "two alpha", self.alpha[i1] ,self.alpha[i2]
if self.alpha[i1] >0 and self.alpha[i1] < self.C and self.alpha[i2] > 0 and self.alpha[i2] < self.C:
print "two b", b1, b2
if self.alpha[i1] >0 and self.alpha[i1] < self.C:
self.b=b1
elif self.alpha[i2] > 0 and self.alpha[i2] < self.C:
self.b=b2
else:
self.b=(b1+b2)/2
self.E[i2]=self.calculateE(i2)
self.E[i1]=self.calculateE(i1)
return 1
else:
return 0
def select_i1(self, i, Ei ):
maxK=-1;
maxDeltaE=0.0
Ej=0
self.E[i]=Ei
for k in range(len(self.dataset)):
if self.alpha[k] > 0 and self.alpha[k] < self.C:
Ek=self.calculateE(k)
deltaE=Ek-Ei
if abs(deltaE) > maxDeltaE:
maxK=k
maxDeltaE=deltaE
Ej=Ek
if maxK != -1:
return maxK
else:
j=i
while j == i:
j=random.randint(0, len(self.dataset))
return j
def calculateE(self, i):
f_x=0.0
trainset_size=len(self.dataset)
for k in range(trainset_size):
f_x+=(self.alpha[k]*int(self.target[k])*self.kernel(k,i))
f_x+=self.b
e_x=f_x-float(self.target[i])
return e_x
def kernel(self, i, j):
return sum([float(self.dataset[i][k])*float(self.dataset[j][k]) for k in range(len(self.dataset[i]))])
def test(self, testset, testset_target):
precision=0.0
correct=0
for k in range(len(testset)):
sample =testset[k]
pred_value=0.0
for i in range(len(sample)):
pred_value+=self.w[i]*sample[i]
pred_value+=self.b
if pred_value >= 0:
label=1
else:
label=-1
if testset_target[k] == label:
correct+=1
precision=correct/(float(len(testset_target)))
return precision
def read_libsvm_format_file(dataset_filename):
dataset_file=file(dataset_filename,'r')
dataset_label=[]
dataset=[]
for line in dataset_file:
splitted=line.strip().split()
dataset_label.append(splitted[0])
sample=[]
for i in range(1,len(splitted)):
index_value=splitted[i].split(":")
sample.append(index_value[1])
dataset.append(sample)
return dataset, dataset_label
if __name__ == "__main__":
dataset, target =read_libsvm_format_file('diabetes')
trainset_size=500
index=range(len(dataset))
random.shuffle(index)
trainset=[ dataset[index[i]] for i in range(trainset_size) ]
trainset_target=[ target[index[i]] for i in range(trainset_size) ]
testset=[ dataset[index[i]] for i in range(trainset_size, len(index)) ]
testset_target=[ target[index[i]] for i in range(trainset_size, len(index)) ]
svm=SVM(dataset, target)
svm.train()
答案 0 :(得分:3)
不幸的是,科学python编程中导致代码运行缓慢的最常见错误是......使用纯python。 Python循环很慢,周期。 And by slow I mean extremely slow.即使假设eveything完全正确,除非您执行以下操作之一,否则最终会使用极慢的优化程序:
之后没有
:
if abs(self.alpha[i2] - alpha2) < 0.00001
所以它甚至都不会运行。
接下来,在修复它并在diabetes
上运行后,它会崩溃
r2 -0.999256460902
in takeStep 658 2
Traceback (most recent call last):
File "a.py", line 218, in <module>
svm.train()
File "a.py", line 26, in train
numChanged+=self.examineExample(i)
File "a.py", line 55, in examineExample
if self.takeStep(i1, i2):
File "a.py", line 79, in takeStep
e1=self.calculateE(i1)
File "a.py", line 160, in calculateE
f_x+=(self.alpha[k]*int(self.target[k])*self.kernel(k,i))
File "a.py", line 167, in kernel
return sum([float(self.dataset[i][k])*float(self.dataset[j][k]) for k in range(len(self.dataset[i]))])
IndexError: list index out of range
由无效阅读功能引起。 libsvm(svmlight)dataformat 稀疏,因此可能缺少某些维度 - 您的代码假定它没有。
您甚至将数据读作字符串
index_value=splitted[i].split(":")
sample.append(index_value[1])
应该是(在您预先分配样本列表后,它们足够大以适应数据,或使用默认值sample = defaultdict(lambda: 0)
)。
index_value=splitted[i].split(":")
sample[int(index_value[0])-1] = float(index_value[1])
同样适用于阅读标签。因此,您的代码中有许多完全冗余的类型转换(您当前的所有float()
和int()
调用都是多余的。)
在w
的最终构建中也存在错误:
[...] self.trainset [...] // you do not have a "trainset" field in SVM
在测试代码中,您多次添加拦截(b
)
for i in range(len(sample)):
pred_value+=self.w[i] * sample[i]
pred_value+=self.b
虽然它应该是
for i in range(len(sample)):
pred_value+=self.w[i] * sample[i]
pred_value+=self.b
如果在SMO本身也会发现许多错误,这可能会导致算法根本不收敛,我不会感到惊讶,但是现在我只能设法找到上述内容。
修复上述所有内容后,删除所有调试打印消息并使用pypy
运行我得到以下模型:
[ - 0.7725132490683443,-2.8232379861128907,0.5166865781499452, 0.1494369704938019,0.1533317981122747,-1.9500615428909012,-0.7957828887451327,-0.12523832631571777]
而scikit-learn给出了
[0.77296251 2.82387247 -0.51692311 -0.14987696 -0.15312237 1.94999242 0.79593224 0.12527931]
所以签署它是相同的模型。
使用C=1
时,两个代码的最终结果都是训练集精确度0.776041666667
真实的9m11.017s
pypy
真正的0m47.033s
pypy
真实0m40.215s
真实的0m0.338s
您的大多数错误似乎都位于数据读取实用程序中。此外,如开头所述 - &#34;经典&#34; python解释器具有极慢的循环,因此您必须使用pypy
(并且缺少对许多库的支持),或cython
(以及更复杂的开发)或至少数值库,例如{{1 }和numpy
。