在使用STARTS_WITH
之类的字符串函数或LIKE
中的诸如LEFT JOIN ON
之类的运算符时,我遇到麻烦了,其中的一个参数都来自于加入。我没有尝试解释摘要,而是提取了一个小示例...
让我们考虑一个名为 fuzzylog 的表,该表具有一个关键字段fullname
,我想通过联接到具有相同列的 names 表来对其进行规范化。 Fuzzylog中的关键字段可能有点混乱或自由形式,因此不可能直接进行等式联接。这些表可能看起来像这样:
fuzzylog
表:
names
表:
names表试图通过提供一个contains
字段来容纳模糊性,如果完全匹配失败,我想使用LIKE
加入该字段:
#standardSQL
SELECT l.id, n.fullname, n.nameid,
l.fullname AS logged_fullname
FROM `neilotemp.fuzzylog` l
LEFT JOIN `neilotemp.names` n
ON l.fullname = n.fullname
OR l.fullname LIKE CONCAT('%', n.contains, '%')
不幸的是,最后一行(我真正需要的那一行)会导致错误:如果没有连接两端的字段相等的条件,则不能使用LEFT OUTER JOIN。我要解决的问题。
我已经进行了很多搜索,但是找不到任何可以使它点击的内容。我必须在正确的道路上迈出一步。
答案 0 :(得分:2)
您可以将其用于数组和相关子查询:
with fuzzylog as (
select 1 as id, 'John Smith' as fullname union all
select 2 as id, 'Jane Doe' UNION ALL
select 6 as id, 'J. Smith'
),
names as (
select 1 as nameid, 'John Smith' as fullname, 'smith' as word
)
select l.id, l.fullname, n.fullname as name_fullname, n.nameid
from (SELECT l.*,
(SELECT array_agg(n.nameid)
from names n
where l.fullname = n.fullname OR lower(l.fullname) LIKE CONCAT('%', lower(n.word), '%')
) nameids
FROM fuzzylog l
) l LEFT JOIN
unnest(l.nameids) the_nameid left join
names n
on n.nameid = the_nameid;
答案 1 :(得分:2)
老实说,我认为在此处使用contains
字段不是最好的主意
而是考虑使用Levenshtein [edit]距离
#standardSQL
CREATE TEMPORARY FUNCTION EDIT_DISTANCE(string1 STRING, string2 STRING)
RETURNS INT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* @param str1 String the first string.
* @param str2 String the second string.
* @return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_string1;
try {
the_string1 = decodeURI(string1).toLowerCase();
} catch (ex) {
the_string1 = string1.toLowerCase();
}
try {
the_string2 = decodeURI(string2).toLowerCase();
} catch (ex) {
the_string2 = string2.toLowerCase();
}
return Levenshtein.get(the_string1, the_string2)
""";
WITH notrmalized_fuzzylog as (
select id, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.fuzzylog`
), normalized_names as (
select nameid, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.names`
)
select
id, l.fullname AS logged_fullname,
ARRAY_AGG(
STRUCT(n.nameid, n.fullname)
ORDER BY EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
)[OFFSET(0)].*
FROM notrmalized_fuzzylog l
CROSS JOIN normalized_names n
GROUP BY 1, 2
您可以使用下面的问题中的虚拟数据进行测试,操作
#standardSQL
CREATE TEMPORARY FUNCTION EDIT_DISTANCE(string1 STRING, string2 STRING)
RETURNS INT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* @param str1 String the first string.
* @param str2 String the second string.
* @return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_string1;
try {
the_string1 = decodeURI(string1).toLowerCase();
} catch (ex) {
the_string1 = string1.toLowerCase();
}
try {
the_string2 = decodeURI(string2).toLowerCase();
} catch (ex) {
the_string2 = string2.toLowerCase();
}
return Levenshtein.get(the_string1, the_string2)
""";
WITH `project.dataset.fuzzylog` AS (
SELECT 1 id, 'John Smith' fullname UNION ALL
SELECT 2, 'Jane Doe' UNION ALL
SELECT 3, 'Ms. Jane Doe' UNION ALL
SELECT 4, 'Mr. John Smith' UNION ALL
SELECT 5, 'Smith, John' UNION ALL
SELECT 6, 'J.Smith' UNION ALL
SELECT 7, 'J. Doe'
), `project.dataset.names` AS (
SELECT 1 nameid, 'John Smith' fullname, 'smith' match UNION ALL
SELECT 2, 'Jane Doe', 'doe'
), notrmalized_fuzzylog as (
select id, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.fuzzylog`
), normalized_names as (
select nameid, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.names`
)
select
id, l.fullname AS logged_fullname,
ARRAY_AGG(
STRUCT(n.nameid, n.fullname)
ORDER BY EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
)[OFFSET(0)].*
FROM notrmalized_fuzzylog l
CROSS JOIN normalized_names n
GROUP BY 1, 2
-- ORDER BY 1
结果:
Row id logged_fullname nameid fullname
1 1 John Smith 1 John Smith
2 2 Jane Doe 2 Jane Doe
3 3 Ms. Jane Doe 2 Jane Doe
4 4 Mr. John Smith 1 John Smith
5 5 Smith, John 1 John Smith
6 6 J.Smith 1 John Smith
7 7 J. Doe 2 Jane Doe
正如您在此解决方案中看到的那样,我们完全忽略/删除了任何额外的人工列(例如contains
),而是应用Levenshtein距离直接测量两个全名之间的相似性。如您所见,在执行此操作之前,我们对全名进行了重新排序/规范化以对其部分进行排序
如果这种方法对您有用-您应该考虑通过先删除/替换所有标点(例如点,逗号等)和空格来改善效果的重新排序