google-bigquery UDF for jaro_winkle_distance

时间:2016-10-05 23:46:10

标签: google-bigquery

我是以下计算jaro_winkle_distance的UDF代码 它似乎在使用json测试数据进行测试时有效,但是当我尝试在google-bigquery UI中调用它时,它始终保持得分为零。 即使是自我加入,例如

输入:

[
 {a: "Liu",b:"Lau"},
 {a: "John",b:"Jone"}]

输出:

[
  {
    "scr": 80
  },
  {
    "scr": 87
  }
]

SQL:

CREATE TEMP FUNCTION
  jwd(a STRING,
    b STRING)
  RETURNS INT64
  LANGUAGE js AS """
    // Assumes 'doInterestingStuff' is defined in one of the library files.
    //return doInterestingStuff(a, b);
    return jaro_winkler_distance(a,b);
""" OPTIONS ( library="gs://kayama808/javascript/jaro_winkler_google_UDF.js" );
SELECT
  x.name name1,
  jwd(x.name,
    x.name) scr
FROM
   babynames.usa_1910_2013_copy x
WHERE
  x.gender = 'F' and x.number >= 1000 and x.state = 'CA'
ORDER BY
  scr DESC;

http://storage.googleapis.com/bigquery-udf-test-tool/testtool.html

https://storage.cloud.google.com/kayama808/javascript/jaro_winkler_google_UDF.js?_ga=1.184402278.1320598031.1475534357

1 个答案:

答案 0 :(得分:3)

请尝试以下操作。它按预期工作,结果为

name1   name2   scr  
Liu     Liu     100  
John    Jone     87  
Liu     Lau      80  

希望您能够将其放回外部lib文件:o)

CREATE TEMP FUNCTION jwd(a STRING, b STRING)
  RETURNS INT64
  LANGUAGE js AS """
/* JS implementation of the strcmp95 C function written by
Bill Winkler, George McLaughlin, Matt Jaro and Maureen Lynch,
released in 1994 (http://web.archive.org/web/20100227020019/http://www.census.gov/geo/msb/stand/strcmp.c).

a and b should be strings. Always performs case-insensitive comparisons
and always adjusts for long strings. */

var jaro_winkler_adjustments = {
  'A': 'E',
  'A': 'I',
  'A': 'O',
  'A': 'U',
  'B': 'V',
  'E': 'I',
  'E': 'O',
  'E': 'U',
  'I': 'O',
  'I': 'U',
  'O': 'U',
  'I': 'Y',
  'E': 'Y',
  'C': 'G',
  'E': 'F',
  'W': 'U',
  'W': 'V',
  'X': 'K',
  'S': 'Z',
  'X': 'S',
  'Q': 'C',
  'U': 'V',
  'M': 'N',
  'L': 'I',
  'Q': 'O',
  'P': 'R',
  'I': 'J',
  '2': 'Z',
  '5': 'S',
  '8': 'B',
  '1': 'I',
  '1': 'L',
  '0': 'O',
  '0': 'Q',
  'C': 'K',
  'G': 'J',
  'E': ' ',
  'Y': ' ', 
  'S': ' '
};

  if (!a || !b) { return 0.0; }

  a = a.trim().toUpperCase();
  b = b.trim().toUpperCase();
  var a_len = a.length;
  var b_len = b.length;
  var a_flag = []; var b_flag = [];
  var search_range = Math.floor(Math.max(a_len, b_len) / 2) - 1;
  var minv = Math.min(a_len, b_len);

  // Looking only within the search range, count and flag the matched pairs. 
  var Num_com = 0;
  var yl1 = b_len - 1;
  for (var i = 0; i < a_len; i++) {
    var lowlim = (i >= search_range) ? i - search_range : 0;
    var hilim  = ((i + search_range) <= yl1) ? (i + search_range) : yl1;
    for (var j = lowlim; j <= hilim; j++) {
      if (b_flag[j] !== 1 && a[j] === b[i]) {
        a_flag[j] = 1;
        b_flag[i] = 1;
        Num_com++;
        break;
      }
    }
  }

  // Return if no characters in common
  if (Num_com === 0) { return 0.0; }

  // Count the number of transpositions
  var k = 0; var N_trans = 0;
  for (var i = 0; i < a_len; i++) {
    if (a_flag[i] === 1) {
      var j;
      for (j = k; j < b_len; j++) {
        if (b_flag[j] === 1) {
          k = j + 1;
          break;
        }
      }
      if (a[i] !== b[j]) { N_trans++; }
    }
  }
  N_trans = Math.floor(N_trans / 2);

  // Adjust for similarities in nonmatched characters
  var N_simi = 0; var adjwt = jaro_winkler_adjustments;
  if (minv > Num_com) {
    for (var i = 0; i < a_len; i++) {
      if (!a_flag[i]) {
        for (var j = 0; j < b_len; j++) {
          if (!b_flag[j]) {
            if (adjwt[a[i]] === b[j]) {
              N_simi += 3;
              b_flag[j] = 2;
              break;
            }
          }
        }
      }
    }
  }

  var Num_sim = (N_simi / 10.0) + Num_com;

  // Main weight computation
  var weight = Num_sim / a_len + Num_sim / b_len + (Num_com - N_trans) / Num_com;
  weight = weight / 3;

  // Continue to boost the weight if the strings are similar
  if (weight > 0.7) {
    // Adjust for having up to the first 4 characters in common
    var j = (minv >= 4) ? 4 : minv;
    var i;
    for (i = 0; (i < j) && a[i] === b[i]; i++) { }
    if (i) { weight += i * 0.1 * (1.0 - weight) };

    // Adjust for long strings.
    // After agreeing beginning chars, at least two more must agree
    // and the agreeing characters must be more than half of the
    // remaining characters.
    if (minv > 4 && Num_com > i + 1 && 2 * Num_com >= minv + i) {
      weight += (1 - weight) * ((Num_com - i - 1) / (a_len * b_len - i*2 + 2));
    }
  }

    return Math.round(weight*100);

""";

SELECT
  name1, name2,
  jwd(name1, name2) scr
FROM -- babynames.usa_1910_2013_copy x
  (
  select "Liu" as name1, "Lau" as name2 union all
  select "Liu" as name1, "Liu" as name2 union all
  select "John" as name1, "Jone" as name2 
  ) x
ORDER BY scr DESC

此外:我们只是仔细检查了您的jaro_winkler_google_UDF2.js文件,并清楚地看到该问题存在于此文件中。
使用我的答案中的代码修复此文件

或者,只需删除其中的以下行

var a = r.a;
var b = r.b;

并取消注释

//jaro_winkler.distance = function(a, b) {

//return Math.round(weight*100)

并使用emit对其进行评论

jaro_winkler_distance=function(r, emit) {

emit(weight);

那你应该没问题!