两个名称列的字符串匹配准确性百分比

时间:2020-07-22 06:54:58

标签: google-bigquery user-defined-functions

我有两个表中的两个列name和ename,我想查找这些列的字符串匹配概率百分比,因为它们都包含员工姓名。 例如。如果值是

name                ename
-----              -------- 
Kim Gutierrez   Kimberly Gutierrez 

则概率百分比约为0.3,因为该百分比值将介于0和1之间。 现在,我在表中找到了大约50万个名称的百分比。

我尝试了此查询,但未获得所需的输出。

CREATE TEMPORARY FUNCTION similarity(name STRING, ename STRING)
  RETURNS FLOAT64
  LANGUAGE js AS """
    var _extend = function(dst) {
      var sources = Array.prototype.slice.call(arguments, 1);
      for (var i=0; i<sources.length; ++i) {
        var src = sources[i];
        for (var p in src) {
          if (src.hasOwnProperty(p)) dst[p] = src[p];
        }
      }
      return dst;
    };

    var Levenshtein = {
      get: function(str1, str2) {
        // base cases
        if (str1 === str2) return 0;
        if (str1.length === 0) return str2.length;
        if (str2.length === 0) return str1.length;

        // two rows
        var prevRow  = new Array(str2.length + 1),
            curCol, nextCol, i, j, tmp;

        // initialise previous row
        for (i=0; i<prevRow.length; ++i) {
          prevRow[i] = i;
        }

        // calculate current row distance from previous row
        for (i=0; i<str1.length; ++i) {
          nextCol = i + 1;

          for (j=0; j<str2.length; ++j) {
            curCol = nextCol;

            // substution
            nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
            // insertion
            tmp = curCol + 1;
            if (nextCol > tmp) {
              nextCol = tmp;
            }
            // deletion
            tmp = prevRow[j + 1] + 1;
            if (nextCol > tmp) {
              nextCol = tmp;
            }

            // copy current col value into previous (in preparation for next iteration)
            prevRow[j] = curCol;
          }

          // copy last col value into previous (in preparation for next iteration)
          prevRow[j] = nextCol;
        }

        return nextCol;
      }

    };

    var the_Name1;

    try {
      the_Name1 = decodeURI(name).toLowerCase();
    } catch (ex) {
      the_Name1 = Name1.toLowerCase();
    }

    try {
      the_Name2 = decodeURI(ename).toLowerCase();
    } catch (ex) {
      the_Name2 = Name2.toLowerCase();
    }

    return 1 - Levenshtein.get(the_Name1, the_Name2) / the_Name1.length;

  """;
SELECT 
  t1.name,
  t2.ename,
  similarity(t1.name, t2.ename) similarity_percentge,
FROM ratings t1 
JOIN reviews t2 on t1.location_id=t2.location_id

0 个答案:

没有答案