识别数据库中的类似字段(但不是重复字段)

时间:2016-07-28 18:11:25

标签: sql google-bigquery

我遇到了一个一直没能解决的查询问题。我有一个存放姓名的数据库,想找出其中同一个 ID 关联了多个名称、且这些名称彼此非常相似的人:

ID                          Name
-------------               ----------
123ABC                      Joe Smith

123ABC                      Joseph Smith

345XYZ                      Michael Johnson

345XYZ                      MikeJohnson

678LMN                      Suzyjones

678LMN                      Suzanne Mary Jones

因此,我希望构建一个能够识别出这些人的查询。大家有什么建议或想法吗?显然这会相当棘手,因为我们要处理的不是完全相同的重复项,而是细微的差异。

3 个答案:

答案 0 :(得分:0)

对表做自连接(self join),连接条件是 ID 相同而名称不同:

-- List every pair of different names that share the same ID.
-- The self join matches rows of your_table with equal IDs. The strict "<"
-- comparison keeps each unordered pair exactly once (a "<>" comparison
-- returns both (a, b) and (b, a)) and also drops rows paired with themselves.
-- The inequality lives in WHERE so that the ON clause is a pure equi-join.
select
    t1.ID,
    t1.NAME as NAME1,
    t2.NAME as NAME2
from your_table as t1
inner join your_table as t2
    on t1.ID = t2.ID
where t1.NAME < t2.NAME

答案 1 :(得分:0)

实现这一目标的方法有很多,我建议你先仔细了解一下 GROUP BY 子句。

  

以下查询假设:表中只有在 ID 附带名称时才会存在对应的记录。

-- Return every row whose ID occurs on more than one row of the table.
-- <yourTable> is a placeholder for the real table name.
-- CTE collects the IDs that appear at least twice; joining it back to the
-- table on ID equality retrieves the full rows for those IDs.
;WITH CTE AS
(
SELECT ID
FROM <yourTable>
GROUP BY ID
HAVING COUNT(*) > 1
)
SELECT T.*
FROM CTE C
JOIN <yourTable> T
-- The join must compare the IDs with "="; "C.id - T.ID" is an arithmetic
-- expression, not a join predicate.
ON C.ID = T.ID

如果同一张表中存在多行名称相同的记录,只需先用 DISTINCT 子句去重即可。

答案 2 :(得分:0)

试试下面的查询,应该能满足你的需求。请注意查询末尾的 WHERE similarity > -1:把 -1 换成其他值,即可控制相似度阈值。阈值越接近 1,捕获到的名称对就越相似;阈值越接近 0,捕获到的名称对就越多!

// Score how similar each pair of names sharing an ID is.
// The inner query self-joins YourTable on ID and keeps each unordered pair
// once (Name1 < Name2). The JS function lower-cases both names and emits
//   similarity = 1 - levenshtein(Name1, Name2) / length(Name1)
// so 1 means identical; the score drops as the edit distance grows and can
// go below 0 when the distance exceeds the length of Name1.
SELECT ID, Name1, Name2, similarity FROM 
JS( // input table
(
  SELECT one.ID AS ID, one.Name AS Name1, two.Name AS Name2
  FROM YourTable AS one
  JOIN YourTable AS two ON one.ID = two.ID
  HAVING Name1 < Name2
) ,
// input columns
ID, Name1, Name2,
// output schema
"[{name: 'ID', type:'string'},
  {name: 'Name1', type:'string'},
  {name: 'Name2', type:'string'},
  {name: 'similarity', type:'float'}]
",
// function
"function(r, emit) {

  var Levenshtein = {
    /**
     * Calculate levenshtein distance of the two strings.
     *
     * Only one row of the distance matrix (prevRow) is kept in memory;
     * curCol / nextCol carry the values of the row being computed.
     *
     * @param str1 String the first string.
     * @param str2 String the second string.
     * @return Integer the levenshtein distance (0 and above).
     */
    get: function(str1, str2) {
      // base cases
      if (str1 === str2) return 0;
      if (str1.length === 0) return str2.length;
      if (str2.length === 0) return str1.length;

      // two rows
      var prevRow  = new Array(str2.length + 1),
          curCol, nextCol, i, j, tmp;

      // initialise previous row
      for (i=0; i<prevRow.length; ++i) {
        prevRow[i] = i;
      }

      // calculate current row distance from previous row
      for (i=0; i<str1.length; ++i) {
        nextCol = i + 1;

        for (j=0; j<str2.length; ++j) {
          curCol = nextCol;

          // substitution
          nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
          // insertion
          tmp = curCol + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }
          // deletion
          tmp = prevRow[j + 1] + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }

          // copy current col value into previous (in preparation for next iteration)
          prevRow[j] = curCol;
        }

        // copy last col value into previous (in preparation for next iteration)
        prevRow[j] = nextCol;
      }

      return nextCol;
    }

  };

  // Declare both locals so that neither becomes an implicit global.
  var the_Name1, the_Name2;

  // decodeURI throws on malformed escape sequences; in that case fall back
  // to the raw value.
  try {
    the_Name1 = decodeURI(r.Name1).toLowerCase();
  } catch (ex) {
    the_Name1 = r.Name1.toLowerCase();
  }

  try {
    the_Name2 = decodeURI(r.Name2).toLowerCase();
  } catch (ex) {
    the_Name2 = r.Name2.toLowerCase();
  }

  // The distance is normalised by the length of Name1 only.
  emit({ID: r.ID, Name1: the_Name1, Name2: the_Name2,
        similarity: 1 - Levenshtein.get(the_Name1, the_Name2) / the_Name1.length});

  }"
)
WHERE similarity > -1
ORDER BY similarity DESC 

您可以使用以下示例进行测试

// Self-contained demo of the name-similarity query: the same inline sample
// rows are used for both sides of the self join, so no real table is needed.
// Each unordered pair of names sharing an ID is kept once (Name1 < Name2);
// the JS function lower-cases both names and emits
//   similarity = 1 - levenshtein(Name1, Name2) / length(Name1)
SELECT ID, Name1, Name2, similarity FROM 
JS( // input table
(
  SELECT one.ID AS ID, one.Name AS Name1, two.Name AS Name2
  FROM (
    SELECT ID, Name FROM
      (SELECT '123ABC' AS ID, 'Joe Smith' AS Name),
      (SELECT '123ABC' AS ID, 'Joseph Smith' AS Name),
      (SELECT '345XYZ' AS ID, 'Michael Johnson' AS Name),
      (SELECT '345XYZ' AS ID, 'MikeJohnson' AS Name),
      (SELECT '678LMN' AS ID, 'Suzyjones' AS Name),
      (SELECT '678LMN' AS ID, 'Suzanne Mary Jones' AS Name),
      (SELECT 'AAA' AS ID, 'Jordan Tigani' AS Name),
      (SELECT 'AAA' AS ID, 'Felipe Hoffa' AS Name),
      (SELECT 'BBB' AS ID, 'Mikhail Berlyant' AS Name),
      (SELECT 'BBB' AS ID, 'Michael Sheldon' AS Name),
  ) AS one
  JOIN (
    SELECT ID, Name FROM
      (SELECT '123ABC' AS ID, 'Joe Smith' AS Name),
      (SELECT '123ABC' AS ID, 'Joseph Smith' AS Name),
      (SELECT '345XYZ' AS ID, 'Michael Johnson' AS Name),
      (SELECT '345XYZ' AS ID, 'MikeJohnson' AS Name),
      (SELECT '678LMN' AS ID, 'Suzyjones' AS Name),
      (SELECT '678LMN' AS ID, 'Suzanne Mary Jones' AS Name),
      (SELECT 'AAA' AS ID, 'Jordan Tigani' AS Name),
      (SELECT 'AAA' AS ID, 'Felipe Hoffa' AS Name),
      (SELECT 'BBB' AS ID, 'Mikhail Berlyant' AS Name),
      (SELECT 'BBB' AS ID, 'Michael Sheldon' AS Name),
  ) AS two
  ON one.ID = two.ID
  HAVING Name1 < Name2
) ,
// input columns
ID, Name1, Name2,
// output schema
"[{name: 'ID', type:'string'},
  {name: 'Name1', type:'string'},
  {name: 'Name2', type:'string'},
  {name: 'similarity', type:'float'}]
",
// function
"function(r, emit) {

  var Levenshtein = {
    /**
     * Calculate levenshtein distance of the two strings.
     *
     * Only one row of the distance matrix (prevRow) is kept in memory;
     * curCol / nextCol carry the values of the row being computed.
     *
     * @param str1 String the first string.
     * @param str2 String the second string.
     * @return Integer the levenshtein distance (0 and above).
     */
    get: function(str1, str2) {
      // base cases
      if (str1 === str2) return 0;
      if (str1.length === 0) return str2.length;
      if (str2.length === 0) return str1.length;

      // two rows
      var prevRow  = new Array(str2.length + 1),
          curCol, nextCol, i, j, tmp;

      // initialise previous row
      for (i=0; i<prevRow.length; ++i) {
        prevRow[i] = i;
      }

      // calculate current row distance from previous row
      for (i=0; i<str1.length; ++i) {
        nextCol = i + 1;

        for (j=0; j<str2.length; ++j) {
          curCol = nextCol;

          // substitution
          nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
          // insertion
          tmp = curCol + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }
          // deletion
          tmp = prevRow[j + 1] + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }

          // copy current col value into previous (in preparation for next iteration)
          prevRow[j] = curCol;
        }

        // copy last col value into previous (in preparation for next iteration)
        prevRow[j] = nextCol;
      }

      return nextCol;
    }

  };

  // Declare both locals so that neither becomes an implicit global.
  var the_Name1, the_Name2;

  // decodeURI throws on malformed escape sequences; in that case fall back
  // to the raw value.
  try {
    the_Name1 = decodeURI(r.Name1).toLowerCase();
  } catch (ex) {
    the_Name1 = r.Name1.toLowerCase();
  }

  try {
    the_Name2 = decodeURI(r.Name2).toLowerCase();
  } catch (ex) {
    the_Name2 = r.Name2.toLowerCase();
  }

  // The distance is normalised by the length of Name1 only.
  emit({ID: r.ID, Name1: the_Name1, Name2: the_Name2,
        similarity: 1 - Levenshtein.get(the_Name1, the_Name2) / the_Name1.length});

  }"
)
WHERE similarity > -1
ORDER BY similarity DESC

它产生以下结果

ID          Name1               Name2               similarity   
123ABC      joe smith           joseph smith        0.6666666666666667   
345XYZ      michael johnson     mikejohnson         0.6666666666666667   
678LMN      suzanne mary jones  suzyjones           0.5  
BBB         michael sheldon     mikhail berlyant    0.4666666666666667   
AAA         felipe hoffa        jordan tigani       0.0