我有一个包含数百万行的大型数据集,其中包含与他们相关联的各种人物和物品。在许多情况下,这些人民'名称也存在于项目名称中。我想找到项目名称的最短子字符串,其中所有者名称或其名称的一部分不再存在。
数据样本如下:
CREATE TABLE test ([ID] nvarchar(255), [OWNER] nvarchar(255), [ITEM] nvarchar(255))
INSERT INTO test
SELECT '1','A B C','A B X X X'
UNION ALL
SELECT '2','ABC DEF','XABCD XX X'
UNION ALL
SELECT '2','ABC DEF','YABCD X X'
UNION ALL
SELECT '3','X X X X','YPD X X'
UNION ALL
SELECT '4','XYZ','X X X'
UNION ALL
SELECT '5','A B C','OOO PPP QQQ'
理想的输出是:
ID | OWNER | ITEM | SHORT ITEM
1 | A B C | A B X X X | X X X
2 | ABC DEF | XABCD XX X | XX X
2 | ABC DEF | YABCD X X | X X
3 | X X X X | YPD X X | X X
4 | XYZ | X X X | X X X
5 | A B C | OOO PPP DDD| PPP QQQ
此输出包括一些情况,我想从项目名称中删除某些不是所有者姓名的内容,因此我将其硬编码到查询中。我写了以下问题:
;WITH p1 as( --Retrieving first word of ITEM and ITEM minus first word
SELECT SUBSTRING([ITEM],1,
case when CHARINDEX(' ',[ITEM])=0 then LEN([ITEM]) --When no space in ITEM, return ITEM
else CHARINDEX(' ', [ITEM]) -1 end) as w1p --Return the first word separated by space
,SUBSTRING([ITEM],CHARINDEX(' ',[ITEM])+1,100) as m1p --Return everything minus the first word
,[ITEM]
,[ID]
,[OWNER]
FROM test
),p2 as( --Retrieving second word of ITEM and ITEM minus second word
SELECT SUBSTRING(m1p,1,
case when CHARINDEX(' ',m1p)=0 then LEN(m1p)
else CHARINDEX(' ',m1p) -1 end) as w2p
,SUBSTRING(m1p,CHARINDEX(' ',m1p)+1,100) as m2p
,[ITEM]
,[ID]
,[w1p]
,[m1p]
FROM p1
),p3 as( --Retrieving third word of ITEM and ITEM minus third word
SELECT SUBSTRING(m2p,1,
case when CHARINDEX(' ',m2p)=0 then LEN(m2p)
else CHARINDEX(' ',m2p) -1 end) as w3p
,SUBSTRING(m2p,CHARINDEX(' ',m2p)+1,100) as m3p
,*
FROM p2
),p4 as( --Retrieving fourth word of ITEM and ITEM minus fourth word
SELECT SUBSTRING(m3p,1,
case when CHARINDEX(' ',m3p)=0 then LEN(m3p)
else CHARINDEX(' ',m3p) -1 end) as w4p
,SUBSTRING(m3p,CHARINDEX(' ',m3p)+1,100) as m4p
,*
FROM p3
),m1 as( --Retrieving first word of OWNER and OWNER minus first word
SELECT SUBSTRING([OWNER],1,
CASE WHEN CHARINDEX(' ',[OWNER])=0 THEN LEN([OWNER])
ELSE CHARINDEX(' ',[OWNER])-1 end) as w1m
,SUBSTRING([OWNER],CHARINDEX(' ',[OWNER])+1,100) as m1m
,[OWNER]
,[ID]
FROM p1
GROUP BY [OWNER], [ID]
),m2 as( --Retrieving second word of OWNER and OWNER minus second word
SELECT SUBSTRING(m1m,1,
case when CHARINDEX(' ', m1m) = 0 then LEN(m1m)
else CHARINDEX(' ', m1m) -1 end) as w2m
,SUBSTRING(m1m,CHARINDEX(' ',m1m)+1,100) as m2m
,*
FROM m1
),m3 as( --Retrieving third word of OWNER and OWNER minus third word
SELECT SUBSTRING(m2m,1,
case when CHARINDEX(' ', m2m) = 0 then LEN(m2m)
else CHARINDEX(' ', m2m) -1 end) as w3m
,SUBSTRING(m2m,CHARINDEX(' ',m2m)+1,100) as m3m
,*
FROM m2
),m4 as( --Retrieving fourth word of OWNER
SELECT SUBSTRING(m3m,1,
case when CHARINDEX(' ', m3m) = 0 then LEN(m3m)
else CHARINDEX(' ', m3m) -1 end) as w4m
,*
FROM m3
),ms as( --Adding special cases not caught by regular query
SELECT CASE WHEN [ID] IN ('3','5') THEN
CASE WHEN [ID] = '3' THEN 'YPD'
WHEN [ID] = '5' THEN 'OOO'
ELSE NULL END
ELSE NULL END as SPEC
,*
FROM m4
)
SELECT m.[ID] --Finding closest shortname
,m.[OWNER]
,p.[ITEM]
,CASE WHEN SUBSTRING(p.[ITEM],1,LEN(m.SPEC)) = SPEC AND SPEC IS NOT NULL THEN LTRIM(SUBSTRING(p.[ITEM],LEN(m.SPEC)+1,100)) --If hardcoded phrase in ITEM, return ITEM minus that phrase
WHEN p.w1p LIKE '%'+m.w1m+'%' AND p.w2p NOT LIKE '%'+m.w2m+'%' AND p.w3p NOT LIKE '%'+m.w3m+'%' AND p.w4p NOT LIKE '%'+m.w4m+'%' THEN p.m1p --If first word of ITEM match first of OWNER, return ITEM minus first
WHEN p.w1p LIKE '%'+m.w1m+'%' AND p.w2p LIKE '%'+m.w2m+'%' AND p.w3p NOT LIKE '%'+m.w3m+'%' AND p.w4p NOT LIKE '%'+m.w4m+'%' THEN p.m2p --If first two words of ITEM match first of OWNER, return ITEM minus two words
WHEN p.w1p LIKE '%'+m.w1m+'%' AND p.w2p LIKE '%'+m.w2m+'%' AND p.w3p LIKE '%'+m.w3m+'%' AND p.w4p NOT LIKE '%'+m.w4m+'%' THEN p.m3p --If first three words of ITEM match first of OWNER, return ITEM minus three words
WHEN p.w1p LIKE '%'+m.w1m+'%' AND p.w2p LIKE '%'+m.w2m+'%' AND p.w3p LIKE '%'+m.w3m+'%' AND p.w4p LIKE '%'+m.w4m+'%' THEN p.m4p --If first four words of ITEM match first of OWNER, return ITEM minus four words
ELSE p.[ITEM]
END AS [SHORT ITEM]
FROM p4 p
LEFT JOIN ms m ON p.[ID] = m.[ID]
虽然这实现了我的目标,但它看起来并不是很好并且感觉它可以被优化。它需要where语句才能具有任何执行速度。虽然我可能不会在完整的数据集上运行这个,但我正在寻找改进的方法。我无权查看执行计划,所以我无法分享。
感谢您提供任何帮助或建议。
答案 0 :(得分:0)
好的我只使用表格的前三行开发了这个。将#test替换为您的表名。如果这对您有用,请告诉我。
select *, row_number() over(order by id) rowid
into #a
from #test
select *, item short_item, row_number() over(order by id) rowid
into #b
from #test
declare @iterator int=1
declare @owner varchar(max)
declare @owneroriginal varchar(max)
declare @item varchar(max)
declare @itemoriginal varchar(max)
declare @itemactualoriginal varchar(max)
while @iterator<=(select max(rowid) from #a)
begin
select @owner=[owner] from #a where rowid=@iterator
select @owneroriginal=[owner] from #a where rowid=@iterator
select @item=[item] from #a where rowid=@iterator
select @itemoriginal=[item] from #a where rowid=@iterator
select @itemactualoriginal=[item] from #a where rowid=@iterator
while @owner<>''
begin
select @owner=left(@owneroriginal, charindex(' ',@owneroriginal))
select @owneroriginal= ltrim(replace(@owneroriginal,@owner,''))
select @item=left(@itemoriginal, charindex(' ',@itemoriginal))
select @itemoriginal= ltrim(replace(@itemoriginal,@item,''))
--select @owner, @owneroriginal, @item, @itemoriginal
if @itemactualoriginal
like '%'+rtrim(@owner)+'%' and @owner<>''
begin
--select 1
update #b
set short_item=replace(short_item, rtrim(@item),'')
where rowid=@iterator
end
else if @@rowcount=0
update #b
set short_item=
case
when @owner = ''
then ltrim(replace(short_item, @owneroriginal,''))
else ltrim(replace(short_item, @owner,''))
end
where rowid=@iterator
end
set @iterator=@iterator+1
end
select id, owner, item, short_item from #b