我想测试 MDLD(Modified Damerau-Levenshtein Distance)的性能,以便把浏览器端的字符串比较集成到 Web 应用程序中。用例是比较“300mm,Packed Wall”和“Packed Wall - 300mm”这样的字符串,所以我一直在寻找一种模糊字符串匹配算法:它能容忍标点符号差异和拼写错误,并且允许字符块换位。
我无法在线找到Javascript的实现。我在CSIRO's Taxamatch Wiki找到了为PL / SQL编写的版本。
这是我将代码转换为 JS 的尝试;基本函数的结果看起来相当准确,但块换位的计算没有给出预期的结果。例如,“Hi There”与“There Hi”的比较返回“6”,无论块限制设置为多少。
如果有人知道可用的实现,能否指点一下?或者,我的改编或源代码本身有什么问题?我做的唯一一处重大改动是在两个地方使用了“Math.ceil()”,而源代码在这些地方似乎使用的是整数除法(总是向下取整)——向下取整在输入为 1 个字符的字符串时会导致奇怪的问题——但这一改动似乎并没有影响我测试过的其他用例的行为。
/**
 * Modified Damerau-Levenshtein Distance (MDLD) between two strings.
 *
 * Computes a Levenshtein-style edit distance (insert / delete / substitute,
 * each costing 1) extended with transpositions of two ADJACENT blocks of
 * EQUAL length. Swapping two adjacent blocks of length `k` costs `k`.
 * Blocks of different lengths (e.g. "Hi There" vs "There Hi") are not
 * recognised as a transposition by this algorithm.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @param {number} [block_lim] - Largest block length to consider for
 *   transpositions. Missing, falsy, non-numeric or < 1 values are treated
 *   as 1; fractional values are rounded down.
 * @returns {number} The edit distance (0 when the strings are identical).
 */
function mdld(str1, str2, block_lim) {
  if (str1 === str2) {
    return 0;
  }
  if (str1.length === 0 || str2.length === 0) {
    return Math.max(str1.length, str2.length);
  }

  // Strip the shared prefix and suffix: they contribute nothing to the
  // distance and shrinking the inputs keeps the matrix small.
  const shortest = Math.min(str1.length, str2.length);
  let prefixLen = 0;
  while (prefixLen < shortest && str1[prefixLen] === str2[prefixLen]) {
    prefixLen += 1;
  }
  let end1 = str1.length;
  let end2 = str2.length;
  while (end1 > prefixLen && end2 > prefixLen && str1[end1 - 1] === str2[end2 - 1]) {
    end1 -= 1;
    end2 -= 1;
  }
  const a = str1.slice(prefixLen, end1);
  const b = str2.slice(prefixLen, end2);
  const lenA = a.length;
  const lenB = b.length;

  if (lenA === 0 || lenB === 0) {
    return Math.max(lenA, lenB);
  }
  if (lenA === 1 && lenB === 1) {
    // The single remaining characters must differ, otherwise the prefix
    // trimming above would have removed them.
    return 1;
  }

  // Normalise the block limit to an integer >= 1 so that the block search
  // below always terminates on the plain Levenshtein step.
  const requestedLimit = Number(block_lim);
  const blockLimit = requestedLimit >= 1 ? Math.floor(requestedLimit) : 1;
  // Two adjacent blocks of length k need 2*k characters in both strings.
  // The floor of 1 keeps the ordinary Levenshtein step for 1-character input.
  const maxBlockLen = Math.max(
    1,
    Math.min(Math.floor(lenA / 2), Math.floor(lenB / 2), blockLimit),
  );

  // dist[s][t] = distance between the first s chars of a and first t of b.
  const dist = Array.from({ length: lenA + 1 }, (_, s) => {
    const column = new Array(lenB + 1);
    column[0] = s;
    return column;
  });
  for (let t = 0; t <= lenB; t += 1) {
    dist[0][t] = t;
  }

  for (let s = 1; s <= lenA; s += 1) {
    for (let t = 1; t <= lenB; t += 1) {
      const cost = a[s - 1] === b[t - 1] ? 0 : 1;

      // Default diagonal step: match / substitute.
      let diagonal = dist[s - 1][t - 1] + cost;

      // Look for the largest pair of swapped adjacent blocks ending here.
      // When one is found it replaces the plain diagonal step.
      for (let blockLen = maxBlockLen; blockLen >= 1; blockLen -= 1) {
        const span = blockLen * 2;
        if (
          s >= span &&
          t >= span &&
          a.substring(s - span, s - blockLen) === b.substring(t - blockLen, t) &&
          a.substring(s - blockLen, s) === b.substring(t - span, t - blockLen)
        ) {
          diagonal = dist[s - span][t - span] + cost + blockLen - 1;
          break;
        }
      }

      dist[s][t] = Math.min(
        dist[s][t - 1] + 1, // insertion
        dist[s - 1][t] + 1, // deletion
        diagonal,
      );
    }
  }

  return dist[lenA][lenB];
}
答案 0 :(得分:0)
最后,我无法弄清楚我对CSIRO代码的改编是什么问题。找到了一个github repo,它使用Ruby扩展https://github.com/GlobalNamesArchitecture/damerau-levenshtein在C中实现了该函数。
改编以获得功能实现。似乎工作正常,但对我的用例不太好。 MDLD可以交换文本块,但仅限于不需要多次连续交换来构造源字符串的情况。转而去看N-Grams。
对于感兴趣的人,这是我的最终结果。性能方面,在块限制设为 5 的情况下,它在大约 5 秒内比较了约 1000 个长度为 20–40 个字符的字符串。
# Read the triangle's edge length from the user, then either draw the
# triangle or print the exit message and terminate.
# NOTE(review): service numbers 5 / 4 / 10 look like the SPIM/MARS syscall
# codes for read-int / print-string / exit — confirm against the simulator.
li $v0,5 # select the "read input" service; the value read comes back in $v0
syscall
# Pass the edge length (base of the right triangle) to printTriangle in $a0.
move $a0, $v0 # $a0 = value the user entered
# Zero or a negative number means "quit"; only positive values draw a triangle.
bgtz $a0, printTriangle # $a0 > 0: go draw the triangle (plain branch, $ra is not set)
# Fall-through path ($a0 <= 0): print ExitMsg and end the program.
li $v0, 4 # select the print-string service
la $a0, ExitMsg # address of the exit message (label defined outside this section)
syscall
li $v0,10 # select the end-program service
syscall
# printTriangle: print a right triangle of stars, one row per line.
# Input:  $a0 = edge length (rows in the triangle); copied to $t3 because
#         $a0 is reused below as the print-string argument.
# Row r (1-based) prints r stars, for r = 1 .. edge length.
# This code does not return through $ra: after the last row it branches to
# the `input` label, which is not defined in this section.
printTriangle:
li $t0, 1 # $t0 = stars printed so far in the current row (1-based)
li $t1, 1 # $t1 = current row number, i.e. stars required in this row
move $t3, $a0 # $t3 = requested edge length (row count)
# Inner loop: print one star per iteration until the row holds $t1 stars.
loop:
li $v0, 4 # select the print-string service
la $a0, star # address of the star string (label defined outside this section)
syscall
beq $t0, $t1, exitLoop # row complete once the star count equals the row number
# Row not finished yet: count the star just printed and print another.
addi $t0, $t0, 1 # one more star in this row
j loop # back to the top of the inner loop
exitLoop:
li $v0, 4 # select the print-string service
la $a0, newLine # address of the newline string (label defined outside this section)
syscall
beq $t1, $t3, input # last row done: go back to `input` to ask for a new edge length
# More rows remain: start the next, longer row.
li $t0, 1 # reset the per-row star counter to 1
addi $t1, $t1, 1 # advance to the next row
j loop