我想做的事情如下: 我可能在列中获取数据,也许只有一个字符串,其格式如下:
> 420-xyz-somefancytext-12.3.2018-etc...
> 4-20-xyz-somefancytext-12.3.2018-etc...
> 4-250-xyz-somefancyothertext-13.3.2018-etc...
> 4-230-xyz-somefancyothertext-14.3.2018-etc...
用例想要检测前两行。因为第一个数字和文字非常相似,当然还有日期。我想到的是测量这种相似性的编辑或余弦距离。
我还在BigQuery中实现了一个非常简单的UDF:
CREATE TEMPORARY FUNCTION similariry(Name1 STRING, Name2 STRING)
RETURNS FLOAT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* @param str1 String the first string.
* @param str2 String the second string.
* @return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_Name1;
try {
the_Name1 = decodeURI(Name1).toLowerCase();
} catch (ex) {
the_Name1 = Name1.toLowerCase();
}
try {
the_Name2 = decodeURI(Name2).toLowerCase();
} catch (ex) {
the_Name2 = Name2.toLowerCase();
}
return 1 - Levenshtein.get(the_Name1, the_Name2) / the_Name1.length;
""";
WITH strings AS (
SELECT NULL string1, NULL string2 UNION ALL
SELECT 'test' string1, NULL string2 UNION ALL
SELECT NULL string1, 'test' string2 UNION ALL
SELECT 'CRATE' string1, 'TRACE' string2 UNION ALL
SELECT 'MARTHA' string1, 'MARHTA' string2 UNION ALL
SELECT 'DWAYNE' string1, 'DUANE' string2 UNION ALL
SELECT 'DIXON' string1, 'DICKSONX' string2 UNION ALL
SELECT 'Dunningham' string1, 'Cunningham' string2 UNION ALL
SELECT 'Abroms' string1, 'Abrams' string2 UNION ALL
SELECT 'Lampley' string1, 'Campley' string2 UNION ALL
SELECT 'Jonathon' string1, 'Jonathan' string2 UNION ALL
SELECT 'Jeraldine' string1, 'Gerladine' string2 UNION ALL
SELECT 'test' string1, 'blank' string2 UNION ALL
SELECT 'everybody' string1, 'every' string2 UNION ALL
SELECT 'a' string1, 'aaa' string2 UNION ALL
SELECT 'Géraldine' string1, 'Gerladine' string2 UNION ALL
SELECT 'Jérôme' string1, 'Jerome' string2 UNION ALL
SELECT 'ça' string1, 'ca' string2 UNION ALL
SELECT 'Üwe' string1, 'Uwe' string2
)
SELECT string1, string2, similariry(string1, string2) my_sim
FROM strings
ORDER BY my_sim DESC
它测量两列的相似性。但我需要的是一种测量行相似性的算法。所以这意味着我必须检查每一行的每一行。我不知道该怎么做以及如何以最有效的方式。最后,应生成具有高相似性的行的表。
答案 0 :(得分:1)
Quick outline
Step 1 - concatenate all columns values in the table into one column
For example, below rows
SELECT 'Abroms' string1, 'Abrams' string2 UNION ALL
SELECT 'Lampley' string1, 'Campley' string2
should become :
SELECT 'AbromsAbrams' cols UNION ALL
SELECT 'LampleyCampley'
The logic of value concatenation can differ from above - but this is just to demonstrate approach
Step 2 - cross join table and apply whatever similarity function you want, so now you treat the whole row as a one column and obviously compare it with the rest of the rows
Details:
Assumptions made (for simplicity sake) : no repeated fields and no structs - just primitive data types
and I will be using below CTE for strings table,
WITH strings AS (
SELECT NULL string1, NULL string2 UNION ALL
SELECT 'test' string1, NULL string2 UNION ALL
SELECT NULL string1, 'test' string2 UNION ALL
SELECT 'CRATE' string1, 'TRACE' string2 UNION ALL
SELECT 'MARTHA' string1, 'MARHTA' string2 UNION ALL
SELECT 'DWAYNE' string1, 'DUANE' string2 UNION ALL
SELECT 'DIXON' string1, 'DICKSONX' string2 UNION ALL
SELECT 'Dunningham' string1, 'Cunningham' string2 UNION ALL
SELECT 'Abroms' string1, 'Abrams' string2 UNION ALL
SELECT 'Lampley' string1, 'Campley' string2 UNION ALL
SELECT 'Jonathon' string1, 'Jonathan' string2 UNION ALL
SELECT 'Jeraldine' string1, 'Gerladine' string2 UNION ALL
SELECT 'test' string1, 'blank' string2 UNION ALL
SELECT 'everybody' string1, 'every' string2 UNION ALL
SELECT 'a' string1, 'aaa' string2 UNION ALL
SELECT 'Géraldine' string1, 'Gerladine' string2 UNION ALL
SELECT 'Jérôme' string1, 'Jerome' string2 UNION ALL
SELECT 'ça' string1, 'ca' string2 UNION ALL
SELECT 'Üwe' string1, 'Uwe' string2
)
so will omit it from rest of the code
Step 1A - build CTE to extract all columns names and concatenate them such that then we can use them to purge from resulted column
#standardSQL
WITH columns AS (
SELECT STRING_AGG(CONCAT('"', col, '":'), '|') cols FROM (
SELECT
REPLACE(SPLIT(pair, '":')[OFFSET(0)], '"', '') col
FROM (
SELECT SPLIT(REGEXP_REPLACE(TO_JSON_STRING(t), r'[{}]', ''), ',"') pairs
FROM strings t
LIMIT 1
), UNNEST(pairs) pair
)
)
SELECT *
FROM columns
result is
Row cols
1 "string1":|"string2":
we will need this shortly
Step 1B - Let's transform original table into table with just one column which consists of all values in the row
#standardSQL
CREATE TEMPORARY FUNCTION concatenate_row(row STRING, columns STRING) AS ((
REGEXP_REPLACE(REGEXP_REPLACE(row, columns, ''), '{"|"}|","', '')
));
WITH columns AS (
SELECT STRING_AGG(CONCAT('"', col, '":'), '|') cols FROM (
SELECT
REPLACE(SPLIT(pair, '":')[OFFSET(0)], '"', '') col
FROM (
SELECT SPLIT(REGEXP_REPLACE(TO_JSON_STRING(t), r'[{}]', ''), ',"') pairs
FROM strings t
LIMIT 1
), UNNEST(pairs) pair
)
), lines AS (
SELECT
TO_JSON_STRING(t) original_row,
concatenate_row(TO_JSON_STRING(t), cols) pure_values
FROM strings t
CROSS JOIN columns
)
SELECT *
FROM lines
with result (just showing few rows ... )
Row original_row pure_values
1 {"string1":"Dunningham","string2":"Cunningham"} DunninghamCunningham
2 {"string1":"Jeraldine","string2":"Gerladine"} JeraldineGerladine
3 {"string1":"Géraldine","string2":"Gerladine"} GéraldineGerladine
4 {"string1":"Jonathon","string2":"Jonathan"} JonathonJonathan
5 {"string1":"everybody","string2":"every"} everybodyevery
Finally, Step 2 - CROSS JOIN and calculating similarity
#standardSQL
SELECT
similarity(s1.pure_values, s2.pure_values) my_sim,
s1.pure_values s1,
s2.pure_values s2
FROM lines s1
CROSS JOIN lines s2
WHERE s1.pure_values < s2.pure_values
ORDER BY my_sim DESC
with result (just show few rows ...)
Row my_sim s1 s2
1 0.8888888888888888 GéraldineGerladine JeraldineGerladine
2 0.5454545454545454 test",null} {null,null}
3 0.5454545454545454 {null,"test {null,null}
4 0.5 aaaa çaca
5 0.36363636363636365 test",null} testblank
6 0.36363636363636365 DWAYNEDUANE ÜweUwe
7 0.33333333333333337 JeraldineGerladine JérômeJerome
. . .
Note: this is just possible direction for yo to go and if chosen - plenty of room for improvements, polishing, etc.
So, if put all together - below is what you get:
#standardSQL
CREATE TEMPORARY FUNCTION concatenate_row(row STRING, columns STRING) AS (
(
REGEXP_REPLACE(REGEXP_REPLACE(row, columns, ''), '{"|"}|","', '')
)
);
CREATE TEMPORARY FUNCTION similarity(Name1 STRING, Name2 STRING)
RETURNS FLOAT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* @param str1 String the first string.
* @param str2 String the second string.
* @return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_Name1;
try {
the_Name1 = decodeURI(Name1).toLowerCase();
} catch (ex) {
the_Name1 = Name1.toLowerCase();
}
try {
the_Name2 = decodeURI(Name2).toLowerCase();
} catch (ex) {
the_Name2 = Name2.toLowerCase();
}
return 1 - Levenshtein.get(the_Name1, the_Name2) / the_Name1.length;
""";
WITH strings AS (
SELECT NULL string1, NULL string2 UNION ALL
SELECT 'test' string1, NULL string2 UNION ALL
SELECT NULL string1, 'test' string2 UNION ALL
SELECT 'CRATE' string1, 'TRACE' string2 UNION ALL
SELECT 'MARTHA' string1, 'MARHTA' string2 UNION ALL
SELECT 'DWAYNE' string1, 'DUANE' string2 UNION ALL
SELECT 'DIXON' string1, 'DICKSONX' string2 UNION ALL
SELECT 'Dunningham' string1, 'Cunningham' string2 UNION ALL
SELECT 'Abroms' string1, 'Abrams' string2 UNION ALL
SELECT 'Lampley' string1, 'Campley' string2 UNION ALL
SELECT 'Jonathon' string1, 'Jonathan' string2 UNION ALL
SELECT 'Jeraldine' string1, 'Gerladine' string2 UNION ALL
SELECT 'test' string1, 'blank' string2 UNION ALL
SELECT 'everybody' string1, 'every' string2 UNION ALL
SELECT 'a' string1, 'aaa' string2 UNION ALL
SELECT 'Géraldine' string1, 'Gerladine' string2 UNION ALL
SELECT 'Jérôme' string1, 'Jerome' string2 UNION ALL
SELECT 'ça' string1, 'ca' string2 UNION ALL
SELECT 'Üwe' string1, 'Uwe' string2
), columns AS (
SELECT STRING_AGG(CONCAT('"', col, '":'), '|') cols FROM (
SELECT
REPLACE(SPLIT(pair, '":')[OFFSET(0)], '"', '') col
FROM (
SELECT SPLIT(REGEXP_REPLACE(TO_JSON_STRING(t), r'[{}]', ''), ',"') pairs
FROM strings t
LIMIT 1
), UNNEST(pairs) pair
)
), lines AS (
SELECT
TO_JSON_STRING(t) original_row,
concatenate_row(TO_JSON_STRING(t), cols) pure_values
FROM strings t
CROSS JOIN columns
)
SELECT
similarity(s1.pure_values, s2.pure_values) my_sim,
s1.pure_values s1,
s2.pure_values s2
FROM lines s1
CROSS JOIN lines s2
WHERE s1.pure_values < s2.pure_values
ORDER BY my_sim DESC