我正在尝试运行一个模拟,以测试随机二进制字符串之间的平均 Levenshtein 距离(Levenshtein distance)。
我的程序是用 Python 写的,但我使用了这个 C 扩展(C extension)。占用大部分运行时间的函数负责计算两个字符串之间的 Levenshtein 距离,代码如下。
/*
 * Compute the Levenshtein (edit) distance between two byte strings.
 *
 * Parameters:
 *   len1, string1  length and contents of the first string
 *   len2, string2  length and contents of the second string
 *   xcost          if nonzero, a substitution costs 2 (the same as one
 *                  deletion plus one insertion); if zero, it costs 1
 *
 * Returns the edit distance, or (size_t)(-1) if the working row cannot be
 * allocated.
 *
 * Only a single row of the cost matrix is kept in memory.  The strings are
 * not modified; the pointers and lengths are adjusted locally only.
 */
size_t
lev_edit_distance(size_t len1, const lev_byte *string1,
                  size_t len2, const lev_byte *string2,
                  int xcost)
{
  size_t i;
  size_t *row;  /* we only need to keep one row of costs */
  size_t *end;
  size_t half;

  /* strip common prefix */
  while (len1 > 0 && len2 > 0 && *string1 == *string2) {
    len1--;
    len2--;
    string1++;
    string2++;
  }

  /* strip common suffix */
  while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) {
    len1--;
    len2--;
  }

  /* catch trivial cases */
  if (len1 == 0)
    return len2;
  if (len2 == 0)
    return len1;

  /* make the inner cycle (i.e. string2) the longer one */
  if (len1 > len2) {
    size_t nx = len1;
    const lev_byte *sx = string1;
    len1 = len2;
    len2 = nx;
    string1 = string2;
    string2 = sx;
  }
  /* check len1 == 1 separately */
  if (len1 == 1) {
    if (xcost)
      return len2 + 1 - 2*(memchr(string2, *string1, len2) != NULL);
    else
      return len2 - (memchr(string2, *string1, len2) != NULL);
  }
  /* from here on len1/len2 are matrix dimensions (string length + 1) */
  len1++;
  len2++;
  half = len1 >> 1;

  /* initialize first row */
  /* refuse sizes whose byte count would wrap around in the multiplication */
  if (len2 > (size_t)(-1) / sizeof(size_t))
    return (size_t)(-1);
  row = (size_t*)malloc(len2*sizeof(size_t));
  if (!row)
    return (size_t)(-1);
  end = row + len2 - 1;
  for (i = 0; i < len2 - (xcost ? 0 : half); i++)
    row[i] = i;

  /* go through the matrix and compute the costs.  yes, this is an extremely
   * obfuscated version, but also extremely memory-conservative and relatively
   * fast.  Each pass of the outer loop overwrites `row` in place, so pass i
   * reads the values left behind by pass i - 1. */
  if (xcost) {
    for (i = 1; i < len1; i++) {
      size_t *p = row + 1;
      const lev_byte char1 = string1[i - 1];
      const lev_byte *char2p = string2;
      size_t D = i;
      size_t x = i;
      while (p <= end) {
        /* match: take the diagonal value; mismatch: extend the left cell */
        if (char1 == *(char2p++))
          x = --D;
        else
          x++;
        /* compare against the cell above (previous row) plus one */
        D = *p;
        D++;
        if (x > D)
          x = D;
        *(p++) = x;
      }
    }
  }
  else {
    /* in this case we don't have to scan two corner triangles (of size len1/2)
     * in the matrix because no best path can go through them. note this
     * breaks when len1 == len2 == 2 so the memchr() special case above is
     * necessary */
    row[0] = len1 - half - 1;
    for (i = 1; i < len1; i++) {
      size_t *p;
      const lev_byte char1 = string1[i - 1];
      const lev_byte *char2p;
      size_t D, x;
      /* skip the upper triangle */
      if (i >= len1 - half) {
        size_t offset = i - (len1 - half);
        size_t c3;

        char2p = string2 + offset;
        p = row + offset;
        c3 = *(p++) + (char1 != *(char2p++));
        x = *p;
        x++;
        D = x;
        if (x > c3)
          x = c3;
        *(p++) = x;
      }
      else {
        p = row + 1;
        char2p = string2;
        D = x = i;
      }
      /* skip the lower triangle */
      if (i <= half + 1)
        end = row + len2 + i - half - 2;
      /* main */
      while (p <= end) {
        size_t c3 = --D + (char1 != *(char2p++));
        x++;
        if (x > c3)
          x = c3;
        D = *p;
        D++;
        if (x > D)
          x = D;
        *(p++) = x;
      }
      /* lower triangle sentinel */
      if (i <= half) {
        size_t c3 = --D + (char1 != *char2p);
        x++;
        if (x > c3)
          x = c3;
        *p = x;
      }
    }
  }

  /* the last cell written holds the distance; copy it out before freeing */
  i = *end;
  free(row);
  return i;
}
这可以加快吗?
我将在一台 AMD FX(tm)-8350 八核处理器的机器上、32 位 Ubuntu 系统中运行这段代码。
这是调用它的python代码。
from Levenshtein import distance
import random
for i in xrange(16):
sum = 0
for j in xrange(1000):
str1 = bin(random.getrandbits(2**i))[2:].zfill(2**i)
str2 = bin(random.getrandbits(2**i))[2:].zfill(2**i)
sum += distance(str1,str2)
print i,sum/(1000*2**i)
答案 0 :(得分:3)
你可以并行运行它。先在开始时生成一个巨大的随机数列表,然后在循环中一次派生 8 个线程,让每个线程处理列表中的一块,并把各自的最终结果累加到 sum 变量中。或者一次生成 8 个随机列表,每次并行处理 8 个。
建议使用 OpenMP 的问题在于:“由于存在大量数据依赖,这种算法很难并行化”——维基百科
from threading import Thread
from threading import Lock

# Running total shared by all worker threads (NOTE: shadows the builtin).
sum = 0
# Guards the read-modify-write on `sum`, which is not atomic across threads.
sum_lock = Lock()


def calc_distance(offset):
    """Add the distance of the string pair at ``randoms[offset]`` to ``sum``."""
    global sum  # without this, `sum += ...` raises UnboundLocalError
    # use whatever addressing scheme is best
    d = distance(randoms[offset][0], randoms[offset][1])
    with sum_lock:
        sum += d


# NOTE(review): CPython threads only run truly in parallel if distance()
# releases the GIL -- confirm; otherwise use multiprocessing instead.
threads = []
for i in xrange(8):
    # args must be a tuple, hence the trailing comma
    t = Thread(target=calc_distance, args=(i,))
    t.start()
    threads.append(t)

# ... later, once every worker has been started, wait for all of them:
for t in threads:
    t.join()
我认为,如果有现成的(或者可以自己编写的)Levenshtein 距离内核,这种方法也可以很好地移植到 OpenCL。
这只是凭记忆匆匆写下的回答,因此可能还有一些细节问题需要解决。
答案 1 :(得分:1)
您可以做的是从本网站学习一些OpenMP概念和指令:A beginner's Primer to OpenMP
您需要一个兼容 OpenMP 的编译器。以下是可用(work)的编译器列表。编译代码时,您需要使用 -fopenmp 选项。
我只在代码中添加了编译器指令 #pragma omp parallel for,它告诉编译器紧随其后的 for 循环可以并行执行(前提是循环的各次迭代之间没有数据依赖)。把 while 循环改写成 for 循环,或者在整个函数中更系统地应用 OpenMP,还可能获得额外的性能提升。您可以在这些代码块之前调用函数 omp_set_num_threads() 来调整执行 for 循环所用的线程数,从而调优性能。对您来说,8 是个不错的取值,因为您将在 8 核处理器上运行。
/*
 * Compute the Levenshtein (edit) distance between two byte strings, using
 * OpenMP for the one loop whose iterations are independent.
 *
 * Parameters:
 *   len1, string1  length and contents of the first string
 *   len2, string2  length and contents of the second string
 *   xcost          if nonzero, a substitution costs 2 (the same as one
 *                  deletion plus one insertion); if zero, it costs 1
 *
 * Returns the edit distance, or (size_t)(-1) if the working row cannot be
 * allocated.
 *
 * Needs <omp.h> for omp_set_num_threads() and an OpenMP-enabled build
 * (e.g. -fopenmp).
 */
size_t
lev_edit_distance(size_t len1, const lev_byte *string1,
                  size_t len2, const lev_byte *string2,
                  int xcost)
{
  size_t i;
  size_t *row;  /* we only need to keep one row of costs */
  size_t *end;
  size_t half;

  // Set the number of threads the OpenMP framework will use to parallelize the for loops
  omp_set_num_threads(8);

  /* strip common prefix */
  while (len1 > 0 && len2 > 0 && *string1 == *string2) {
    len1--;
    len2--;
    string1++;
    string2++;
  }

  /* strip common suffix */
  while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) {
    len1--;
    len2--;
  }

  /* catch trivial cases */
  if (len1 == 0)
    return len2;
  if (len2 == 0)
    return len1;

  /* make the inner cycle (i.e. string2) the longer one */
  if (len1 > len2) {
    size_t nx = len1;
    const lev_byte *sx = string1;
    len1 = len2;
    len2 = nx;
    string1 = string2;
    string2 = sx;
  }
  /* check len1 == 1 separately */
  if (len1 == 1) {
    if (xcost)
      return len2 + 1 - 2*(memchr(string2, *string1, len2) != NULL);
    else
      return len2 - (memchr(string2, *string1, len2) != NULL);
  }
  /* from here on len1/len2 are matrix dimensions (string length + 1) */
  len1++;
  len2++;
  half = len1 >> 1;

  /* initialize first row */
  /* refuse sizes whose byte count would wrap around in the multiplication */
  if (len2 > (size_t)(-1) / sizeof(size_t))
    return (size_t)(-1);
  row = (size_t*)malloc(len2*sizeof(size_t));
  if (!row)
    return (size_t)(-1);
  end = row + len2 - 1;
  /* every iteration writes a distinct row[i], so this loop is safe to split
   * across threads */
  #pragma omp parallel for
  for (i = 0; i < len2 - (xcost ? 0 : half); i++)
    row[i] = i;

  /* go through the matrix and compute the costs.  yes, this is an extremely
   * obfuscated version, but also extremely memory-conservative and relatively
   * fast.
   *
   * The two outer loops below must stay sequential: pass i overwrites `row`
   * in place and reads the values left there by pass i - 1 (and the second
   * loop also updates the shared `end`).  Running the passes concurrently
   * would be a data race and give wrong distances. */
  if (xcost) {
    for (i = 1; i < len1; i++) {
      size_t *p = row + 1;
      const lev_byte char1 = string1[i - 1];
      const lev_byte *char2p = string2;
      size_t D = i;
      size_t x = i;
      while (p <= end) {
        /* match: take the diagonal value; mismatch: extend the left cell */
        if (char1 == *(char2p++))
          x = --D;
        else
          x++;
        /* compare against the cell above (previous row) plus one */
        D = *p;
        D++;
        if (x > D)
          x = D;
        *(p++) = x;
      }
    }
  }
  else {
    /* in this case we don't have to scan two corner triangles (of size len1/2)
     * in the matrix because no best path can go through them. note this
     * breaks when len1 == len2 == 2 so the memchr() special case above is
     * necessary */
    row[0] = len1 - half - 1;
    for (i = 1; i < len1; i++) {
      size_t *p;
      const lev_byte char1 = string1[i - 1];
      const lev_byte *char2p;
      size_t D, x;
      /* skip the upper triangle */
      if (i >= len1 - half) {
        size_t offset = i - (len1 - half);
        size_t c3;

        char2p = string2 + offset;
        p = row + offset;
        c3 = *(p++) + (char1 != *(char2p++));
        x = *p;
        x++;
        D = x;
        if (x > c3)
          x = c3;
        *(p++) = x;
      }
      else {
        p = row + 1;
        char2p = string2;
        D = x = i;
      }
      /* skip the lower triangle */
      if (i <= half + 1)
        end = row + len2 + i - half - 2;
      /* main */
      while (p <= end) {
        size_t c3 = --D + (char1 != *(char2p++));
        x++;
        if (x > c3)
          x = c3;
        D = *p;
        D++;
        if (x > D)
          x = D;
        *(p++) = x;
      }
      /* lower triangle sentinel */
      if (i <= half) {
        size_t c3 = --D + (char1 != *char2p);
        x++;
        if (x > c3)
          x = c3;
        *p = x;
      }
    }
  }

  /* the last cell written holds the distance; copy it out before freeing */
  i = *end;
  free(row);
  return i;
}
您也可以对 for 循环中被操作的变量使用 reduction(归约)运算,以便并行完成求和、乘法等简单计算。
/*
 * Sketch of the simulation driver in C: each value of i (string length 2^i)
 * is handled by its own OpenMP loop iteration.
 *
 * NOTE: the str1/str2 assignments and the distance() call below are Python
 * placeholders and must be replaced with real C before this compiles.
 */
int main()
{
    int i = 0;
    #pragma omp parallel for
    for(i=0;i<16;i++)
    {
        /* Declared inside the loop body so every iteration (and therefore
         * every thread) gets its own copies; sharing them across threads
         * would be a data race. */
        int j = 0,
            sum = 0;
        char str1[30]; // Change size to fit your specifications
        char str2[30];
        // Could do a reduction on sum across all threads
        for(j=0;j<1000;j++)
        {
            // Calls will have to be changed
            // I don't know much Python so I'll leave that to the experts
            str1 = bin(random.getrandbits(2**i))[2:].zfill(2**i)
            str2 = bin(random.getrandbits(2**i))[2:].zfill(2**i)
            sum += distance(str1,str2)
        }
        /* 1 << i is the string length 2^i (2*i would be 0 for i == 0 and
         * divide by zero); the floating-point divisor keeps the fraction.
         * Lines may appear in any order because iterations run in parallel. */
        printf("%d %f\n",i,sum/(1000.0*(1 << i)));
    }
    return 0;
}
答案 2 :(得分:1)
我要做什么:
1)非常小的优化:一次性分配 row,以避免内存管理开销。或者您可以尝试 realloc(),也可以用一个静态变量记录 row 的大小(同时把 row 本身也设为静态)。不过这节省不了多少时间,尽管实现成本很低。
2)您要计算的是平均值。把求平均的计算也放到 C 里去做,这样可以省去一些调用开销。同样,收益不大,但成本很低。
3)由于您并不关心实际的计算过程,只关心结果,那么假设您有三台 PC,每台都是四核机器:在每台机器上运行四个程序实例,并把循环次数减少到原来的十二分之一。这样您只需十二分之一的时间就能得到十二个结果:把它们取平均,就大功告成了。
选项 #3 除了循环次数之外完全不需要修改代码;您可能希望把循环次数做成命令行参数,以便在数量不定的计算机上部署程序。实际上,您可能还希望同时输出结果及其“权重”,以便在汇总各个结果时把出错的可能性降到最低。
for j in xrange(N):
str1 = bin(random.getrandbits(2**i))[2:].zfill(2**i)
str2 = bin(random.getrandbits(2**i))[2:].zfill(2**i)
sum += distance(str1,str2)
print N,i,sum/(N*2**i)
但如果您感兴趣的是一般意义上的 Levenshtein 统计特性,我不太确定只用 0 和 1 两个符号进行计算是否符合您的目的。对于字符串 01010101,您既可以翻转全部八个字符,也可以删除第一个字符并在末尾添加一个零来得到 10101010,而这两种做法的代价并不相同。如果字符可以取字母表中的所有字母,第二种情况出现的可能性就会小得多,平均代价的结果也应当随之改变。还是说我漏掉了什么?
答案 3 :(得分:0)
有人在一两年前对此做了大量研究,并进行了运行时间测试。
他提出了这个方案(this),基本上是利用解决方案树来加快速度。