我是以下计算jaro_winkle_distance的UDF代码 它似乎在使用json测试数据进行测试时有效,但是当我尝试在google-bigquery UI中调用它时,它始终保持得分为零。 即使是自我加入,例如
输入:
[
{a: "Liu",b:"Lau"},
{a: "John",b:"Jone"}]
输出:
[
{
"scr": 80
},
{
"scr": 87
}
]
SQL:
CREATE TEMP FUNCTION
jwd(a STRING,
b STRING)
RETURNS INT64
LANGUAGE js AS """
// Assumes 'doInterestingStuff' is defined in one of the library files.
//return doInterestingStuff(a, b);
return jaro_winkler_distance(a,b);
""" OPTIONS ( library="gs://kayama808/javascript/jaro_winkler_google_UDF.js" );
SELECT
x.name name1,
jwd(x.name,
x.name) scr
FROM
babynames.usa_1910_2013_copy x
WHERE
x.gender = 'F' and x.number >= 1000 and x.state = 'CA'
ORDER BY
scr DESC;
http://storage.googleapis.com/bigquery-udf-test-tool/testtool.html
答案 0 :(得分:3)
请尝试以下操作。它按预期工作,结果为
name1 name2 scr
Liu Liu 100
John Jone 87
Liu Lau 80
希望您能够将其放回外部lib文件:o)
CREATE TEMP FUNCTION jwd(a STRING, b STRING)
RETURNS INT64
LANGUAGE js AS """
/* JS implementation of the strcmp95 C function written by
Bill Winkler, George McLaughlin, Matt Jaro and Maureen Lynch,
released in 1994 (http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).
a and b should be strings. Always performs case-insensitive comparisons
and always adjusts for long strings. */
var jaro_winkler_adjustments = {
'A': 'E',
'A': 'I',
'A': 'O',
'A': 'U',
'B': 'V',
'E': 'I',
'E': 'O',
'E': 'U',
'I': 'O',
'I': 'U',
'O': 'U',
'I': 'Y',
'E': 'Y',
'C': 'G',
'E': 'F',
'W': 'U',
'W': 'V',
'X': 'K',
'S': 'Z',
'X': 'S',
'Q': 'C',
'U': 'V',
'M': 'N',
'L': 'I',
'Q': 'O',
'P': 'R',
'I': 'J',
'2': 'Z',
'5': 'S',
'8': 'B',
'1': 'I',
'1': 'L',
'0': 'O',
'0': 'Q',
'C': 'K',
'G': 'J',
'E': ' ',
'Y': ' ',
'S': ' '
};
if (!a || !b) { return 0.0; }
a = a.trim().toUpperCase();
b = b.trim().toUpperCase();
var a_len = a.length;
var b_len = b.length;
var a_flag = []; var b_flag = [];
var search_range = Math.floor(Math.max(a_len, b_len) / 2) - 1;
var minv = Math.min(a_len, b_len);
// Looking only within the search range, count and flag the matched pairs.
var Num_com = 0;
var yl1 = b_len - 1;
for (var i = 0; i < a_len; i++) {
var lowlim = (i >= search_range) ? i - search_range : 0;
var hilim = ((i + search_range) <= yl1) ? (i + search_range) : yl1;
for (var j = lowlim; j <= hilim; j++) {
if (b_flag[j] !== 1 && a[j] === b[i]) {
a_flag[j] = 1;
b_flag[i] = 1;
Num_com++;
break;
}
}
}
// Return if no characters in common
if (Num_com === 0) { return 0.0; }
// Count the number of transpositions
var k = 0; var N_trans = 0;
for (var i = 0; i < a_len; i++) {
if (a_flag[i] === 1) {
var j;
for (j = k; j < b_len; j++) {
if (b_flag[j] === 1) {
k = j + 1;
break;
}
}
if (a[i] !== b[j]) { N_trans++; }
}
}
N_trans = Math.floor(N_trans / 2);
// Adjust for similarities in nonmatched characters
var N_simi = 0; var adjwt = jaro_winkler_adjustments;
if (minv > Num_com) {
for (var i = 0; i < a_len; i++) {
if (!a_flag[i]) {
for (var j = 0; j < b_len; j++) {
if (!b_flag[j]) {
if (adjwt[a[i]] === b[j]) {
N_simi += 3;
b_flag[j] = 2;
break;
}
}
}
}
}
}
var Num_sim = (N_simi / 10.0) + Num_com;
// Main weight computation
var weight = Num_sim / a_len + Num_sim / b_len + (Num_com - N_trans) / Num_com;
weight = weight / 3;
// Continue to boost the weight if the strings are similar
if (weight > 0.7) {
// Adjust for having up to the first 4 characters in common
var j = (minv >= 4) ? 4 : minv;
var i;
for (i = 0; (i < j) && a[i] === b[i]; i++) { }
if (i) { weight += i * 0.1 * (1.0 - weight) };
// Adjust for long strings.
// After agreeing beginning chars, at least two more must agree
// and the agreeing characters must be more than half of the
// remaining characters.
if (minv > 4 && Num_com > i + 1 && 2 * Num_com >= minv + i) {
weight += (1 - weight) * ((Num_com - i - 1) / (a_len * b_len - i*2 + 2));
}
}
return Math.round(weight*100);
""";
SELECT
name1, name2,
jwd(name1, name2) scr
FROM -- babynames.usa_1910_2013_copy x
(
select "Liu" as name1, "Lau" as name2 union all
select "Liu" as name1, "Liu" as name2 union all
select "John" as name1, "Jone" as name2
) x
ORDER BY scr DESC
此外:我们只是仔细检查了您的jaro_winkler_google_UDF2.js文件,并清楚地看到该问题存在于此文件中。
使用我的答案中的代码修复此文件
或者,只需删除其中的以下行
var a = r.a;
var b = r.b;
并取消注释
//jaro_winkler.distance = function(a, b) {
//return Math.round(weight*100)
并使用emit
对其进行评论
jaro_winkler_distance=function(r, emit) {
emit(weight);
那你应该没问题!