我有两个表中的两个列name和ename,我想查找这些列的字符串匹配概率百分比,因为它们都包含员工姓名。 例如。如果值是
name ename
----- --------
Kim Gutierrez Kimberly Gutierrez
则概率百分比约为0.3,因为该百分比值将介于0和1之间。 现在,我在表中找到了大约50万个名称的百分比。
我尝试了此查询,但未获得所需的输出。
CREATE TEMPORARY FUNCTION similarity(name STRING, ename STRING)
RETURNS FLOAT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_Name1;
try {
the_Name1 = decodeURI(name).toLowerCase();
} catch (ex) {
the_Name1 = Name1.toLowerCase();
}
try {
the_Name2 = decodeURI(ename).toLowerCase();
} catch (ex) {
the_Name2 = Name2.toLowerCase();
}
return 1 - Levenshtein.get(the_Name1, the_Name2) / the_Name1.length;
""";
SELECT
t1.name,
t2.ename,
similarity(t1.name, t2.ename) similarity_percentge,
FROM ratings t1
JOIN reviews t2 on t1.location_id=t2.location_id