SQL(BigQuery)文本相似度

时间:2018-04-19 14:43:41

标签: google-bigquery bigdata user-defined-functions text-mining

我想做的事情如下: 我可能在列中获取数据,也许只有一个字符串,其格式如下:

> 420-xyz-somefancytext-12.3.2018-etc...
> 4-20-xyz-somefancytext-12.3.2018-etc...
> 4-250-xyz-somefancyothertext-13.3.2018-etc...
> 4-230-xyz-somefancyothertext-14.3.2018-etc...

用例想要检测前两行。因为第一个数字和文字非常相似,当然还有日期。我想到的是测量这种相似性的编辑或余弦距离。

我还在BigQuery中实现了一个非常简单的UDF:

 -- Normalized Levenshtein similarity of two strings, implemented as a JavaScript UDF.
 --   Name1, Name2 : strings to compare; both are URI-decoded when possible and lower-cased first.
 --   Returns      : FLOAT64 in [0, 1] (1 = identical after normalization), or NULL if either input is NULL.
 CREATE TEMPORARY FUNCTION similariry(Name1 STRING, Name2 STRING)
  RETURNS FLOAT64
  LANGUAGE js AS """
    // A SQL NULL reaches the UDF as JavaScript null. decodeURI(null) would
    // silently turn it into the text 'null', so propagate NULL instead.
    if (Name1 == null || Name2 == null) return null;

    /**
     * Calculate the Levenshtein distance of the two strings.
     *
     * @param str1 String the first string.
     * @param str2 String the second string.
     * @return Integer the Levenshtein distance (0 and above).
     */
    function levenshtein(str1, str2) {
      // base cases
      if (str1 === str2) return 0;
      if (str1.length === 0) return str2.length;
      if (str2.length === 0) return str1.length;

      // a single row of the distance matrix is kept and updated in place
      var prevRow = new Array(str2.length + 1),
          curCol, nextCol, i, j;

      // initialise previous row
      for (i = 0; i < prevRow.length; ++i) {
        prevRow[i] = i;
      }

      // calculate current row distance from previous row
      for (i = 0; i < str1.length; ++i) {
        nextCol = i + 1;

        for (j = 0; j < str2.length; ++j) {
          curCol = nextCol;

          // cheapest of substitution, insertion and deletion
          nextCol = Math.min(
            prevRow[j] + ((str1.charAt(i) === str2.charAt(j)) ? 0 : 1),
            curCol + 1,
            prevRow[j + 1] + 1
          );

          // copy current col value into previous (in preparation for next iteration)
          prevRow[j] = curCol;
        }

        // copy last col value into previous (in preparation for next iteration)
        prevRow[j] = nextCol;
      }

      return nextCol;
    }

    // URI-decode when the text is a valid encoded URI, otherwise use it as is;
    // lower-case in both cases so the comparison is case-insensitive.
    function normalize(value) {
      try {
        return decodeURI(value).toLowerCase();
      } catch (ex) {
        return value.toLowerCase();
      }
    }

    var the_Name1 = normalize(Name1);
    var the_Name2 = normalize(Name2);

    // The distance never exceeds the longer length, so dividing by it keeps the
    // score inside [0, 1] and makes it independent of the argument order.
    var longest = Math.max(the_Name1.length, the_Name2.length);

    // two empty strings are identical; this also avoids a 0 / 0 division
    if (longest === 0) return 1;

    return 1 - levenshtein(the_Name1, the_Name2) / longest;

  """;

  -- Sample pairs (including NULLs, accented characters and strings of unequal
  -- length) scored column-against-column with the UDF, best matches first.
  WITH strings AS (
    SELECT NULL         AS string1, NULL         AS string2 UNION ALL
    SELECT 'test'       AS string1, NULL         AS string2 UNION ALL
    SELECT NULL         AS string1, 'test'       AS string2 UNION ALL
    SELECT 'CRATE'      AS string1, 'TRACE'      AS string2 UNION ALL
    SELECT 'MARTHA'     AS string1, 'MARHTA'     AS string2 UNION ALL
    SELECT 'DWAYNE'     AS string1, 'DUANE'      AS string2 UNION ALL
    SELECT 'DIXON'      AS string1, 'DICKSONX'   AS string2 UNION ALL
    SELECT 'Dunningham' AS string1, 'Cunningham' AS string2 UNION ALL
    SELECT 'Abroms'     AS string1, 'Abrams'     AS string2 UNION ALL
    SELECT 'Lampley'    AS string1, 'Campley'    AS string2 UNION ALL
    SELECT 'Jonathon'   AS string1, 'Jonathan'   AS string2 UNION ALL
    SELECT 'Jeraldine'  AS string1, 'Gerladine'  AS string2 UNION ALL
    SELECT 'test'       AS string1, 'blank'      AS string2 UNION ALL
    SELECT 'everybody'  AS string1, 'every'      AS string2 UNION ALL
    SELECT 'a'          AS string1, 'aaa'        AS string2 UNION ALL
    SELECT 'Géraldine'  AS string1, 'Gerladine'  AS string2 UNION ALL
    SELECT 'Jérôme'     AS string1, 'Jerome'     AS string2 UNION ALL
    SELECT 'ça'         AS string1, 'ca'         AS string2 UNION ALL
    SELECT 'Üwe'        AS string1, 'Uwe'        AS string2
  )
  SELECT
    string1,
    string2,
    similariry(string1, string2) AS my_sim
  FROM strings
  ORDER BY my_sim DESC

它测量的是两列之间的相似性。但我需要的是一种测量行与行之间相似性的算法,也就是说,我必须把每一行与其他所有行逐一比较。我不知道该怎么做,也不知道怎样做才最高效。最后,应生成一张包含高相似度行的表。

1 个答案:

答案 0 :(得分:1)

Quick outline
Step 1 - concatenate all columns values in the table into one column

For example, below rows

SELECT 'Abroms' string1, 'Abrams' string2 UNION ALL   
SELECT 'Lampley'    string1, 'Campley'    string2 

should become :

SELECT 'AbromsAbrams' cols UNION ALL
SELECT 'LampleyCampley'  

The logic of value concatenation can differ from above - but this is just to demonstrate approach

Step 2 - cross join the table with itself and apply whatever similarity function you want, so now you treat the whole row as one column and compare it with the rest of the rows

Details:
Assumptions made (for simplicity's sake): no repeated fields and no structs - just primitive data types

and I will be using below CTE for strings table,

-- Sample data: one row per pair of strings, including NULLs and accented text.
WITH strings AS (
  SELECT NULL         AS string1, NULL         AS string2 UNION ALL
  SELECT 'test'       AS string1, NULL         AS string2 UNION ALL
  SELECT NULL         AS string1, 'test'       AS string2 UNION ALL
  SELECT 'CRATE'      AS string1, 'TRACE'      AS string2 UNION ALL
  SELECT 'MARTHA'     AS string1, 'MARHTA'     AS string2 UNION ALL
  SELECT 'DWAYNE'     AS string1, 'DUANE'      AS string2 UNION ALL
  SELECT 'DIXON'      AS string1, 'DICKSONX'   AS string2 UNION ALL
  SELECT 'Dunningham' AS string1, 'Cunningham' AS string2 UNION ALL
  SELECT 'Abroms'     AS string1, 'Abrams'     AS string2 UNION ALL
  SELECT 'Lampley'    AS string1, 'Campley'    AS string2 UNION ALL
  SELECT 'Jonathon'   AS string1, 'Jonathan'   AS string2 UNION ALL
  SELECT 'Jeraldine'  AS string1, 'Gerladine'  AS string2 UNION ALL
  SELECT 'test'       AS string1, 'blank'      AS string2 UNION ALL
  SELECT 'everybody'  AS string1, 'every'      AS string2 UNION ALL
  SELECT 'a'          AS string1, 'aaa'        AS string2 UNION ALL
  SELECT 'Géraldine'  AS string1, 'Gerladine'  AS string2 UNION ALL
  SELECT 'Jérôme'     AS string1, 'Jerome'     AS string2 UNION ALL
  SELECT 'ça'         AS string1, 'ca'         AS string2 UNION ALL
  SELECT 'Üwe'        AS string1, 'Uwe'        AS string2
)

so will omit it from rest of the code

Step 1A - build a CTE that extracts all column names and concatenates them, so that we can later use them to strip the names out of the resulting column

#standardSQL
-- Builds a '|'-separated list of the JSON key prefixes ("<column>":) found in
-- one row of `strings`, for later use as a regular expression.
WITH columns AS (
  SELECT
    STRING_AGG(CONCAT('"', col, '":'), '|') AS cols
  FROM (
    -- the text before '":' in each pair, with the quotes removed, is the column name
    SELECT
      REPLACE(SPLIT(pair, '":')[OFFSET(0)], '"', '') AS col
    FROM (
      -- serialize a single row to JSON, drop the braces, split into key/value pairs
      SELECT
        SPLIT(REGEXP_REPLACE(TO_JSON_STRING(t), r'[{}]', ''), ',"') AS pairs
      FROM strings AS t
      LIMIT 1
    )
    CROSS JOIN UNNEST(pairs) AS pair
  )
)
SELECT cols
FROM columns
result is

Row cols     
1   "string1":|"string2":   

we will need this shortly

Step 1B - Let's transform original table into table with just one column which consists of all values in the row

#standardSQL
-- Collapses a row serialized with TO_JSON_STRING into the concatenation of its
-- string values.
--   row     : JSON text of one row, e.g. {"string1":"Abroms","string2":"Abrams"}
--   columns : regex alternation of the key prefixes, e.g. "string1":|"string2":
--             (used as a regular expression as is; it is not escaped here)
--   Returns : the values glued together without separators; JSON nulls contribute
--             nothing, and NULL is returned when no value text is left at all.
CREATE TEMPORARY FUNCTION concatenate_row(row STRING, columns STRING) AS ((
  NULLIF(
    REGEXP_REPLACE(
      -- Drop every key prefix together with an unquoted JSON null that directly
      -- follows it; a quoted value such as "null" starts with '"' and is kept.
      REGEXP_REPLACE(row, CONCAT('(?:', columns, ')(?:null)?'), ''),
      -- Strip the opening brace, the closing brace and the separators between
      -- values, including the bare commas left behind by removed nulls.
      r'^\{,*"?|"?,*\}$|",+"',
      ''
    ),
    ''
  )
));
-- columns: '|'-separated list of the JSON key prefixes ("<column>":) taken from
-- one row of `strings`.
WITH columns AS (
  SELECT
    STRING_AGG(CONCAT('"', col, '":'), '|') AS cols
  FROM (
    SELECT
      REPLACE(SPLIT(pair, '":')[OFFSET(0)], '"', '') AS col
    FROM (
      SELECT
        SPLIT(REGEXP_REPLACE(TO_JSON_STRING(t), r'[{}]', ''), ',"') AS pairs
      FROM strings AS t
      LIMIT 1
    )
    CROSS JOIN UNNEST(pairs) AS pair
  )
),
-- lines: every row of `strings` as its JSON text plus the result of passing that
-- JSON and the key list to concatenate_row.
lines AS (
  SELECT
    TO_JSON_STRING(t) AS original_row,
    concatenate_row(TO_JSON_STRING(t), cols) AS pure_values
  FROM strings AS t
  CROSS JOIN columns
)
SELECT
  original_row,
  pure_values
FROM lines

with result (just showing few rows ... )

Row original_row                                    pure_values  
1   {"string1":"Dunningham","string2":"Cunningham"} DunninghamCunningham
2   {"string1":"Jeraldine","string2":"Gerladine"}   JeraldineGerladine
3   {"string1":"Géraldine","string2":"Gerladine"}   GéraldineGerladine
4   {"string1":"Jonathon","string2":"Jonathan"}     JonathonJonathan
5   {"string1":"everybody","string2":"every"}       everybodyevery

Finally, Step 2 - CROSS JOIN and calculating similarity

#standardSQL
-- Score pairs of rows from `lines` against each other. The strict '<' keeps each
-- unordered pair once and drops self-pairs; rows whose concatenated values are
-- equal are therefore never paired with one another.
SELECT
  similarity(left_row.pure_values, right_row.pure_values) AS my_sim,
  left_row.pure_values AS s1,
  right_row.pure_values AS s2
FROM lines AS left_row
CROSS JOIN lines AS right_row
WHERE left_row.pure_values < right_row.pure_values
ORDER BY my_sim DESC

with result (just show few rows ...)

Row my_sim              s1                  s2   
1   0.8888888888888888  GéraldineGerladine  JeraldineGerladine   
2   0.5454545454545454  test",null}         {null,null}  
3   0.5454545454545454  {null,"test         {null,null}  
4   0.5                 aaaa                çaca     
5   0.36363636363636365 test",null}         testblank    
6   0.36363636363636365 DWAYNEDUANE         ÜweUwe   
7   0.33333333333333337 JeraldineGerladine  JérômeJerome     
. . . 

Note: this is just a possible direction for you to go in; if you choose it, there is plenty of room for improvement, polishing, etc.

So, if put all together - below is what you get:

#standardSQL
-- Collapses a row serialized with TO_JSON_STRING into the concatenation of its
-- string values.
--   row     : JSON text of one row, e.g. {"string1":"Abroms","string2":"Abrams"}
--   columns : regex alternation of the key prefixes, e.g. "string1":|"string2":
--             (used as a regular expression as is; it is not escaped here)
--   Returns : the values glued together without separators; JSON nulls contribute
--             nothing, and NULL is returned when no value text is left at all.
CREATE TEMPORARY FUNCTION concatenate_row(row STRING, columns STRING) AS (
  (
    NULLIF(
      REGEXP_REPLACE(
        -- Drop every key prefix together with an unquoted JSON null that directly
        -- follows it; a quoted value such as "null" starts with '"' and is kept.
        REGEXP_REPLACE(row, CONCAT('(?:', columns, ')(?:null)?'), ''),
        -- Strip the opening brace, the closing brace and the separators between
        -- values, including the bare commas left behind by removed nulls.
        r'^\{,*"?|"?,*\}$|",+"',
        ''
      ),
      ''
    )
  )
);

 -- Normalized Levenshtein similarity of two strings, implemented as a JavaScript UDF.
 --   Name1, Name2 : strings to compare; both are URI-decoded when possible and lower-cased first.
 --   Returns      : FLOAT64 in [0, 1] (1 = identical after normalization), or NULL if either input is NULL.
 CREATE TEMPORARY FUNCTION similarity(Name1 STRING, Name2 STRING)
  RETURNS FLOAT64
  LANGUAGE js AS """
    // A SQL NULL reaches the UDF as JavaScript null. decodeURI(null) would
    // silently turn it into the text 'null', so propagate NULL instead.
    if (Name1 == null || Name2 == null) return null;

    /**
     * Calculate the Levenshtein distance of the two strings.
     *
     * @param str1 String the first string.
     * @param str2 String the second string.
     * @return Integer the Levenshtein distance (0 and above).
     */
    function levenshtein(str1, str2) {
      // base cases
      if (str1 === str2) return 0;
      if (str1.length === 0) return str2.length;
      if (str2.length === 0) return str1.length;

      // a single row of the distance matrix is kept and updated in place
      var prevRow = new Array(str2.length + 1),
          curCol, nextCol, i, j;

      // initialise previous row
      for (i = 0; i < prevRow.length; ++i) {
        prevRow[i] = i;
      }

      // calculate current row distance from previous row
      for (i = 0; i < str1.length; ++i) {
        nextCol = i + 1;

        for (j = 0; j < str2.length; ++j) {
          curCol = nextCol;

          // cheapest of substitution, insertion and deletion
          nextCol = Math.min(
            prevRow[j] + ((str1.charAt(i) === str2.charAt(j)) ? 0 : 1),
            curCol + 1,
            prevRow[j + 1] + 1
          );

          // copy current col value into previous (in preparation for next iteration)
          prevRow[j] = curCol;
        }

        // copy last col value into previous (in preparation for next iteration)
        prevRow[j] = nextCol;
      }

      return nextCol;
    }

    // URI-decode when the text is a valid encoded URI, otherwise use it as is;
    // lower-case in both cases so the comparison is case-insensitive.
    function normalize(value) {
      try {
        return decodeURI(value).toLowerCase();
      } catch (ex) {
        return value.toLowerCase();
      }
    }

    var the_Name1 = normalize(Name1);
    var the_Name2 = normalize(Name2);

    // The distance never exceeds the longer length, so dividing by it keeps the
    // score inside [0, 1] and makes it independent of the argument order.
    var longest = Math.max(the_Name1.length, the_Name2.length);

    // two empty strings are identical; this also avoids a 0 / 0 division
    if (longest === 0) return 1;

    return 1 - levenshtein(the_Name1, the_Name2) / longest;
  """;

  -- strings: sample data, one row per pair of strings (NULLs and accents included).
  WITH strings AS (
    SELECT NULL         AS string1, NULL         AS string2 UNION ALL
    SELECT 'test'       AS string1, NULL         AS string2 UNION ALL
    SELECT NULL         AS string1, 'test'       AS string2 UNION ALL
    SELECT 'CRATE'      AS string1, 'TRACE'      AS string2 UNION ALL
    SELECT 'MARTHA'     AS string1, 'MARHTA'     AS string2 UNION ALL
    SELECT 'DWAYNE'     AS string1, 'DUANE'      AS string2 UNION ALL
    SELECT 'DIXON'      AS string1, 'DICKSONX'   AS string2 UNION ALL
    SELECT 'Dunningham' AS string1, 'Cunningham' AS string2 UNION ALL
    SELECT 'Abroms'     AS string1, 'Abrams'     AS string2 UNION ALL
    SELECT 'Lampley'    AS string1, 'Campley'    AS string2 UNION ALL
    SELECT 'Jonathon'   AS string1, 'Jonathan'   AS string2 UNION ALL
    SELECT 'Jeraldine'  AS string1, 'Gerladine'  AS string2 UNION ALL
    SELECT 'test'       AS string1, 'blank'      AS string2 UNION ALL
    SELECT 'everybody'  AS string1, 'every'      AS string2 UNION ALL
    SELECT 'a'          AS string1, 'aaa'        AS string2 UNION ALL
    SELECT 'Géraldine'  AS string1, 'Gerladine'  AS string2 UNION ALL
    SELECT 'Jérôme'     AS string1, 'Jerome'     AS string2 UNION ALL
    SELECT 'ça'         AS string1, 'ca'         AS string2 UNION ALL
    SELECT 'Üwe'        AS string1, 'Uwe'        AS string2
  ),
  -- columns: '|'-separated list of the JSON key prefixes ("<column>":) taken
  -- from one row of `strings`.
  columns AS (
    SELECT
      STRING_AGG(CONCAT('"', col, '":'), '|') AS cols
    FROM (
      SELECT
        REPLACE(SPLIT(pair, '":')[OFFSET(0)], '"', '') AS col
      FROM (
        SELECT
          SPLIT(REGEXP_REPLACE(TO_JSON_STRING(t), r'[{}]', ''), ',"') AS pairs
        FROM strings AS t
        LIMIT 1
      )
      CROSS JOIN UNNEST(pairs) AS pair
    )
  ),
  -- lines: every row as its JSON text plus the result of passing that JSON and
  -- the key list to concatenate_row.
  lines AS (
    SELECT
      TO_JSON_STRING(t) AS original_row,
      concatenate_row(TO_JSON_STRING(t), cols) AS pure_values
    FROM strings AS t
    CROSS JOIN columns
  )
-- Score pairs of rows against each other. The strict '<' keeps each unordered
-- pair once and drops self-pairs; rows whose concatenated values are equal are
-- therefore never paired with one another.
SELECT
  similarity(left_row.pure_values, right_row.pure_values) AS my_sim,
  left_row.pure_values AS s1,
  right_row.pure_values AS s2
FROM lines AS left_row
CROSS JOIN lines AS right_row
WHERE left_row.pure_values < right_row.pure_values
ORDER BY my_sim DESC