答案 0 :(得分:1)
请尝试以下查询（BigQuery 标准 SQL）：
#standardSQL
-- similarity(Text1, Text2): length-normalised edit distance between two strings.
--
-- Both arguments are URI-decoded when they are well-formed URIs, lower-cased,
-- and compared with the Levenshtein distance; the distance is then divided by
-- the length of the processed Text1. Lower scores mean closer strings and 0
-- means identical, so ordering by this score ascending ranks the best match
-- first.
--
-- Edge cases:
--   * two empty strings score 0 (a plain division would give 0 / 0 = NaN);
--   * an empty Text1 with a non-empty Text2 still divides by zero, which
--     JavaScript evaluates to Infinity;
--   * NOTE(review): a SQL NULL presumably reaches JavaScript as null, which
--     decodeURI stringifies to the text null - filter NULLs before calling
--     and confirm against the BigQuery UDF documentation.
CREATE TEMPORARY FUNCTION similarity(Text1 STRING, Text2 STRING)
RETURNS FLOAT64
LANGUAGE js AS """
  // Levenshtein distance between str1 and str2, computed with one reusable
  // row of costs instead of a full matrix.
  function levenshtein(str1, str2) {
    // Base cases: identical strings, or one side empty.
    if (str1 === str2) return 0;
    if (str1.length === 0) return str2.length;
    if (str2.length === 0) return str1.length;

    var prevRow = [];
    var i, j, curCol, nextCol;

    // Row 0: turning the empty prefix of str1 into the first j characters
    // of str2 costs j insertions.
    for (j = 0; j <= str2.length; ++j) {
      prevRow[j] = j;
    }

    for (i = 0; i < str1.length; ++i) {
      // Column 0 of the current row: i + 1 deletions.
      nextCol = i + 1;
      for (j = 0; j < str2.length; ++j) {
        curCol = nextCol;
        nextCol = Math.min(
          prevRow[j] + (str1.charAt(i) === str2.charAt(j) ? 0 : 1), // substitution
          curCol + 1,                                               // insertion
          prevRow[j + 1] + 1                                        // deletion
        );
        // Slot j is no longer needed for this row, so it can already take
        // the current row's value for the next iteration of i.
        prevRow[j] = curCol;
      }
      // Store the last column of the current row as well.
      prevRow[j] = nextCol;
    }
    return nextCol;
  }

  // Lower-cased form of an argument. Percent-escapes are decoded when the
  // text is a well-formed URI; otherwise decodeURI throws and the raw text
  // is used instead.
  function normalise(text) {
    try {
      return decodeURI(text).toLowerCase();
    } catch (ex) {
      return text.toLowerCase();
    }
  }

  var the_Text1 = normalise(Text1);
  var the_Text2 = normalise(Text2);
  var distance = levenshtein(the_Text1, the_Text2);

  // Identical inputs score 0 regardless of length; returning early keeps two
  // empty strings from producing 0 / 0 = NaN.
  if (distance === 0) return 0;
  return distance / the_Text1.length;
""";
-- For every tab2 row, look up the tab1 description with the lowest
-- similarity() score. tab1 separates its parts with '|', so the separator is
-- rewritten to ', ' before scoring; the stored tab1 text is returned as is.
SELECT
  t2.*,
  (
    SELECT t1.Item_description
    FROM `project.dataset.tab1` AS t1
    -- NULL descriptions carry no text to compare; skipping them leaves
    -- matched_description NULL for tab2 rows that have no description.
    WHERE t1.Item_description IS NOT NULL
      AND t2.Item_description IS NOT NULL
    ORDER BY
      similarity(t2.Item_description, REPLACE(t1.Item_description, '|', ', ')),
      -- Tie-breaker: equal scores would otherwise be returned in arbitrary order.
      t1.Item_description
    LIMIT 1
  ) AS matched_description
FROM `project.dataset.tab2` AS t2
如果将其应用于问题中的示例数据，结果将为：
Row Customer_id Item_description matched_description
1 1001 Item Lenovo x1 Yoga, i7 14" is delivered Lenovo x1 Yoga|i7 14"
2 1002 Lenovo x1 Yoga, i5 13" is delivered to customer Lenovo x1 Yoga|i5 13"
3 1003 Lenovo Yoga, i7 14" is delivered to customer@1003 Lenovo Yoga|i7 14"
4 1004 Item lenovo x1 yoga, i7 14" is delivered successfully Lenovo x1 Yoga|i7 14"
5 1005 Item Lenovo x1 Yoga, i7 14" is delivered@1005 Lenovo x1 Yoga|i7 14"
答案 1 :(得分:0)
我会使用正则表达式，把使每条描述唯一的特征提取出来（即对描述进行标记化）。
-- Tokenise each description into three spec fields and match the two tables
-- on all of them:
--   xspec: the letter x followed by one digit
--   ispec: the letter i followed by one digit
--   size : two digits followed by a double quote
-- A field that is absent from a description becomes the literal 'none', so
-- rows lacking the same field still compare equal in the join.
WITH tab1_specs AS (
  SELECT
    Item_description,
    COALESCE(REGEXP_EXTRACT(Item_description, r'([x][0-9])'), 'none') AS xspec,
    COALESCE(REGEXP_EXTRACT(Item_description, r'([i][0-9])'), 'none') AS ispec,
    COALESCE(REGEXP_EXTRACT(Item_description, r'([0-9]{2}\")'), 'none') AS size
  FROM Tab1
),

tab2_specs AS (
  SELECT
    Customer_id,
    Item_description,
    COALESCE(REGEXP_EXTRACT(Item_description, r'([x][0-9])'), 'none') AS xspec,
    COALESCE(REGEXP_EXTRACT(Item_description, r'([i][0-9])'), 'none') AS ispec,
    COALESCE(REGEXP_EXTRACT(Item_description, r'([0-9]{2}\")'), 'none') AS size
  FROM Tab2
)

-- Every Tab1 description is kept; the Tab2 columns are NULL when no Tab2 row
-- shares all three spec fields.
SELECT
  tab1_specs.Item_description AS Tab1_Item_description,
  tab2_specs.Item_description AS Tab2_Item_description,
  tab2_specs.Customer_id
FROM tab1_specs
LEFT JOIN tab2_specs
  USING (xspec, ispec, size)
请注意，我没有处理 Lenovo 或 Yoga 这两个词；但如果您的真实数据集包含多个品牌/型号，则需要以类似的方式对它们进行处理。