我需要合并七个表。每个表都有三个键列和一个日期列。我尝试过几种合并数据的方法,其中一种是把所有记录插入到同一张表中,然后运行下面这条用于去重的 T-SQL CTE 语句(由 Stack Overflow 上的另一位成员提供):
-- Remove duplicate (key1, key2, key3) rows from dbo.Table1, keeping only the
-- row with the latest [date] in each key group.
-- Groups in which every [date] is NULL are left untouched: the EXISTS filter
-- only admits rows whose key group contains at least one dated row.
;WITH ranked AS (
    SELECT
        t1.[key1],
        t1.[key2],
        t1.[key3],
        t1.[date],
        -- NULL dates are mapped to 1900-01-01 so they sort after real dates
        -- (assumes no real date is earlier than 1900-01-01) and are deleted
        -- whenever the group also holds a dated row.
        ROW_NUMBER() OVER (
            PARTITION BY t1.[key1], t1.[key2], t1.[key3]
            ORDER BY ISNULL(t1.[date], '19000101') DESC
        ) AS RN
    FROM dbo.Table1 AS t1
    WHERE EXISTS (
        SELECT *
        FROM dbo.Table1 AS t2
        WHERE t2.key1 = t1.key1
          AND t2.key2 = t1.key2
          AND t2.key3 = t1.key3
          -- Test the matched row (t2), not the outer row: NULL-dated rows must
          -- stay in the ranked set so they can be removed when a date exists.
          AND t2.[date] IS NOT NULL
    )
)
DELETE FROM ranked
WHERE RN > 1;
我的问题是每个表都有约 8000 万条记录,去重操作会给 SQL Server 带来过重的负载。
我的规则如下:
CASE 1: before deduplication:
key1 key2 key3 date
1 A 1 null
1 A 1 null
1 A 1 null
after deduplication:
key1 key2 key3 date
1 A 1 null
1 A 1 null
1 A 1 null
CASE 2: before deduplication:
key1 key2 key3 date
1 A 1 1/1/2016
1 A 1 1/1/2016
1 A 1 1/1/2016
after deduplication:
key1 key2 key3 date
1 A 1 1/1/2016
CASE 3: before deduplication:
key1 key2 key3 date
1 A 1 1/1/2016
1 A 1 1/2/2016
1 A 1 1/3/2016
after deduplication:
key1 key2 key3 date
1 A 1 1/3/2016
CASE 4: before deduplication:
key1 key2 key3 date
1 A 1 1/1/2016
1 A 1 1/1/2016
1 A 1 null
after deduplication:
key1 key2 key3 date
1 A 1 1/1/2016
CASE 5: before deduplication:
key1 key2 key3 date
1 A 1 1/1/2016
1 A 1 1/2/2016
1 A 1 null
after deduplication:
key1 key2 key3 date
1 A 1 1/2/2016
正如您所看到的,这几乎是一个SCD2场景加上NULL处理问题。
我的下一步尝试是将一个表合并到另一个表中并进行重复数据删除,然后重复,直到完成所有操作。
我也在研究 MERGE 方案,但不清楚它在这么大的数据量下该如何运作。我一直在争取使用 SSIS,但没有成功。
我正在寻找任何可能的想法来执行此任务。 谢谢。
答案 0 :(得分:0)
我希望我能正确理解你的目的。
抱歉,这里无法保留格式,至少我做不到。建议你把下面的内容粘贴到 SQL Server Management Studio 中查看。
考虑到数据负载,我建议您考虑实施一些中间步骤并合并临时表中的相关数据。 如果可能,还要考虑使用NoLock提示。
-- Sample temp tables, one per de-duplication case.
-- key1/key2/key3 are the three keys; key4 is the nullable date column.
-- A space is required between TABLE and the #name, otherwise "table#tmp1"
-- is read as a single identifier and the statement does not parse.
create table #tmp1 (key1 int not null, key2 char(1) not null, key3 int not null, key4 date null);
create table #tmp2 (key1 int not null, key2 char(1) not null, key3 int not null, key4 date null);
create table #tmp3 (key1 int not null, key2 char(1) not null, key3 int not null, key4 date null);
create table #tmp4 (key1 int not null, key2 char(1) not null, key3 int not null, key4 date null);
create table #tmp5 (key1 int not null, key2 char(1) not null, key3 int not null, key4 date null);
-- Fixture rows: #tmp1..#tmp5 reproduce CASE 1..CASE 5 from the question.
-- Explicit column lists keep the inserts valid if the table layout changes.
-- #tmp1: all dates NULL.
insert into #tmp1 (key1, key2, key3, key4)
values (1, 'A', 1, null), (1, 'A', 1, null), (1, 'A', 1, null);

-- #tmp2: three identical dated rows.
insert into #tmp2 (key1, key2, key3, key4)
values (1, 'A', 1, '2016-01-01'), (1, 'A', 1, '2016-01-01'), (1, 'A', 1, '2016-01-01');

-- #tmp3: three different dates.
insert into #tmp3 (key1, key2, key3, key4)
values (1, 'A', 1, '2016-01-01'), (1, 'A', 1, '2016-01-02'), (1, 'A', 1, '2016-01-03');

-- #tmp4: two identical dates plus one NULL.
insert into #tmp4 (key1, key2, key3, key4)
values (1, 'A', 1, '2016-01-01'), (1, 'A', 1, '2016-01-01'), (1, 'A', 1, null);

-- #tmp5: two different dates plus one NULL.
insert into #tmp5 (key1, key2, key3, key4)
values (1, 'A', 1, '2016-01-01'), (1, 'A', 1, '2016-01-02'), (1, 'A', 1, null);
-- 第一种返回结果(我对你的需求不太确定)
-- First variant: de-duplicate each source table independently.
-- For every table the result holds
--   * one row per (key1, key2, key3) carrying the latest non-NULL key4, and
--   * every NULL-dated row whose key has no dated row in that same table
--     (these are deliberately not de-duplicated).
-- The same key can therefore appear once per source table.
select
    merged.key1,
    merged.key2,
    merged.key3,
    merged.key4
from (
    -- latest non-NULL date per key, one branch per table
    select key1, key2, key3, max(key4) as key4
    from #tmp1
    where key4 is not null
    group by key1, key2, key3
    union all
    select key1, key2, key3, max(key4) as key4
    from #tmp2
    where key4 is not null
    group by key1, key2, key3
    union all
    select key1, key2, key3, max(key4) as key4
    from #tmp3
    where key4 is not null
    group by key1, key2, key3
    union all
    select key1, key2, key3, max(key4) as key4
    from #tmp4
    where key4 is not null
    group by key1, key2, key3
    union all
    select key1, key2, key3, max(key4) as key4
    from #tmp5
    where key4 is not null
    group by key1, key2, key3
    -- nulls, do not deduplicate: keep NULL rows only when the key has no
    -- dated row in the same table
    union all
    select a.key1, a.key2, a.key3, a.key4
    from #tmp1 as a
    where a.key4 is null
      and not exists (
          select *
          from #tmp1 as b
          where b.key1 = a.key1 and b.key2 = a.key2 and b.key3 = a.key3
            and b.key4 is not null
      )
    union all
    select a.key1, a.key2, a.key3, a.key4
    from #tmp2 as a
    where a.key4 is null
      and not exists (
          select *
          from #tmp2 as b
          where b.key1 = a.key1 and b.key2 = a.key2 and b.key3 = a.key3
            and b.key4 is not null
      )
    union all
    select a.key1, a.key2, a.key3, a.key4
    from #tmp3 as a
    where a.key4 is null
      and not exists (
          select *
          from #tmp3 as b
          where b.key1 = a.key1 and b.key2 = a.key2 and b.key3 = a.key3
            and b.key4 is not null
      )
    union all
    select a.key1, a.key2, a.key3, a.key4
    from #tmp4 as a
    where a.key4 is null
      and not exists (
          select *
          from #tmp4 as b
          where b.key1 = a.key1 and b.key2 = a.key2 and b.key3 = a.key3
            and b.key4 is not null
      )
    union all
    select a.key1, a.key2, a.key3, a.key4
    from #tmp5 as a
    where a.key4 is null
      and not exists (
          select *
          from #tmp5 as b
          where b.key1 = a.key1 and b.key2 = a.key2 and b.key3 = a.key3
            and b.key4 is not null
      )
) as merged;
-- 第二种返回结果(我对你的需求不太确定)
-- Second variant: de-duplicate dated rows across all source tables.
-- The first branch pools every non-NULL row from #tmp1..#tmp5 and keeps the
-- latest key4 per (key1, key2, key3). The remaining branches append the
-- NULL-dated rows, un-deduplicated, for keys that have no dated row in the
-- SAME table (the check is per table, not across tables).
select
    nonnulls.key1,
    nonnulls.key2,
    nonnulls.key3,
    max(nonnulls.key4) as key4
from (
    select key1, key2, key3, key4
    from #tmp1
    where key4 is not null
    union all
    select key1, key2, key3, key4
    from #tmp2
    where key4 is not null
    union all
    select key1, key2, key3, key4
    from #tmp3
    where key4 is not null
    union all
    select key1, key2, key3, key4
    from #tmp4
    where key4 is not null
    union all
    select key1, key2, key3, key4
    from #tmp5
    where key4 is not null
) as nonnulls
group by nonnulls.key1, nonnulls.key2, nonnulls.key3
-- nulls, do not deduplicate
union all
select a.key1, a.key2, a.key3, a.key4
from #tmp1 as a
left join #tmp1 as b
    on a.key1 = b.key1 and a.key2 = b.key2 and a.key3 = b.key3 and b.key4 is not null
where a.key4 is null and b.key4 is null
union all
select a.key1, a.key2, a.key3, a.key4
from #tmp2 as a
left join #tmp2 as b
    on a.key1 = b.key1 and a.key2 = b.key2 and a.key3 = b.key3 and b.key4 is not null
where a.key4 is null and b.key4 is null
union all
select a.key1, a.key2, a.key3, a.key4
from #tmp3 as a
left join #tmp3 as b
    on a.key1 = b.key1 and a.key2 = b.key2 and a.key3 = b.key3 and b.key4 is not null
where a.key4 is null and b.key4 is null
union all
select a.key1, a.key2, a.key3, a.key4
from #tmp4 as a
left join #tmp4 as b
    on a.key1 = b.key1 and a.key2 = b.key2 and a.key3 = b.key3 and b.key4 is not null
where a.key4 is null and b.key4 is null
union all
select a.key1, a.key2, a.key3, a.key4
from #tmp5 as a
left join #tmp5 as b
    on a.key1 = b.key1 and a.key2 = b.key2 and a.key3 = b.key3 and b.key4 is not null
where a.key4 is null and b.key4 is null;