有一张表A。表A中的一行如下所示:
+----+---------+---------+---------+------------+------------+------------------+------------------+
| id | value_a | value_b | value_c | created_on | created_by | last_modified_on | last_modified_by |
+----+---------+---------+---------+------------+------------+------------------+------------------+
| 42 | x | y | z | 2016-04-01 | Maria | 2016-05-01 | Jim |
+----+---------+---------+---------+------------+------------+------------------+------------------+
因此,表A仅包含最新值。
还有一个名为changelog的表。它存储了有关表A的所有更改/更新。表A的这条记录在changelog表中对应的记录如下所示:
+-----+-----------+--------+---------+-----------+-----------------------------------------+------------+------------+
| id | object_id | action | field | old_value | new_value | created_on | created_by |
+-----+-----------+--------+---------+-----------+-----------------------------------------+------------+------------+
| 234 | 42 | insert | NULL | NULL | {value_a: xx, value_b: yy, value_c: zz} | 2016-04-01 | Maria |
| 456 | 42 | update | value_a | xx | x | 2016-04-05 | Bob |
| 467 | 42 | update | value_b | yy | y | 2016-05-01 | Jim |
| 678 | 42 | update | value_c | zz | z | 2016-05-01 | Jim |
+-----+-----------+--------+---------+-----------+-----------------------------------------+------------+------------+
我需要创建一个historical_A表,对于这条特定记录,它将如下所示:
+----+---------+---------+---------+------------+------------+------------+--------------+
| id | value_a | value_b | value_c | valid_from | created_by | valid_to | modified_by |
+----+---------+---------+---------+------------+------------+------------+--------------+
| 42 | xx | yy | zz | 2016-04-01 | Maria | 2016-04-05 | Bob |
| 42 | x | yy | zz | 2016-04-05 | Bob | 2016-05-01 | Jim |
| 42 | x | y | z | 2016-05-01 | Jim | | |
+----+---------+---------+---------+------------+------------+------------+--------------+
表A大约有1 500 000行,表A对应的changelog表大约有27 000 000行。
目前我正在使用SQL和Python脚本进行初始转换(加载)。基本上,我先为初始行生成一条insert语句(通过解析json),然后按changelog表的created_on列分组,生成所有后续的insert语句。
目前,处理表A的1000行大约需要3分钟。因此,我将脚本并行执行(x10),以便更快地获得结果。
我怀疑Sql + Python脚本不是问题的最佳解决方案。是否有针对所提出问题的纯SQL解决方案? 是否有针对此类问题的既定最佳实践?
答案 0 :(得分:-1)
不幸的是,我的MySQL环境坏了,所以我是在SQL Server中完成的,但我认为代码不存在任何兼容性问题。如果它对你有用,我很想知道它的性能如何。您可能需要添加索引来提升性能。 - SQL Restoring historical data from the changelog
/*
create table a
( id int, value_a varchar(20), value_b varchar(20), value_c varchar(20),
created_on date, created_by varchar(20), last_modified_on date, last_modified_by varchar(20));
create table changelog
( id int, object_id int, action varchar(20), field varchar(20) , old_value varchar(20), new_value varchar(50), created_on date, created_by varchar(20));
create table history_work
(changeid int,objectid int, value_a varchar(20), value_b varchar(20), value_c varchar(20), value_a_new varchar(20), value_b_new varchar(20), value_c_new varchar(20),
created_on date, created_by varchar(20), last_modified_on date, last_modified_by varchar(20));
CREATE TABLE `history` (
`changeid` INT(11) NULL DEFAULT NULL,
`objectid` INT(11) NULL DEFAULT NULL,
`value_a` VARCHAR(20) NULL DEFAULT NULL,
`value_b` VARCHAR(20) NULL DEFAULT NULL,
`value_c` VARCHAR(20) NULL DEFAULT NULL,
`valid_from` DATE NULL DEFAULT NULL,
`created_by` VARCHAR(20) NULL DEFAULT NULL,
`valid_to` DATE NULL DEFAULT NULL,
`last_modified_by` VARCHAR(20) NULL DEFAULT NULL
)
COLLATE='latin1_swedish_ci'
ENGINE=InnoDB;
drop table if exists t;
CREATE TABLE `t` (
`changeid` INT(11) NULL DEFAULT NULL,
`objectid` INT(11) NULL DEFAULT NULL,
`value_a` VARCHAR(20) NULL DEFAULT NULL,
`value_b` VARCHAR(20) NULL DEFAULT NULL,
`value_c` VARCHAR(20) NULL DEFAULT NULL,
`value_a_new` VARCHAR(20) NULL DEFAULT NULL,
`value_b_new` VARCHAR(20) NULL DEFAULT NULL,
`value_c_new` VARCHAR(20) NULL DEFAULT NULL,
`created_on` DATE NULL DEFAULT NULL,
`created_by` VARCHAR(20) NULL DEFAULT NULL,
`last_modified_on` DATE NULL DEFAULT NULL,
`last_modified_by` VARCHAR(20) NULL DEFAULT NULL
)
COLLATE='latin1_swedish_ci'
ENGINE=InnoDB
;
;
expected result
+----+---------+---------+---------+------------+------------+------------+--------------+
| id | value_a | value_b | value_c | valid_from | created_by | valid_to | modified_by |
+----+---------+---------+---------+------------+------------+------------+--------------+
| 42 | xx | yy | zz | 2016-04-01 | Maria | 2016-04-05 | Bob |
| 42 | x | yy | zz | 2016-04-05 | Bob | 2016-05-01 | Jim |
| 42 | x | y | z | 2016-05-01 | Jim | | |
+----+---------+---------+---------+------------+------------+------------+--------------+
*/
/* Reset the source and working tables, then load the sample fixture from the
   question: object 42 and its four changelog entries.
   Every insert names its target columns so the statements do not depend on
   the physical column order of the tables. */
truncate table a;
truncate table changelog;
truncate table history_work;

insert into a
    (id, value_a, value_b, value_c,
     created_on, created_by, last_modified_on, last_modified_by)
values
    (42, 'x', 'y', 'z', '2016-04-01', 'Maria', '2016-05-01', 'Jim');

insert into changelog
    (id, object_id, action, field, old_value, new_value, created_on, created_by)
values
    (234, 42, 'insert', null,      null, '{value_a: xx, value_b: yy, value_c: zz}', '2016-04-01', 'Maria'),
    (456, 42, 'update', 'value_a', 'xx', 'x',                                       '2016-04-05', 'Bob'),
    (467, 42, 'update', 'value_b', 'yy', 'y',                                       '2016-05-01', 'Jim'),
    (678, 42, 'update', 'value_c', 'zz', 'z',                                       '2016-05-01', 'Jim');
/* Seed history_work with one placeholder row per row of table a.
   changeid 0 identifies the placeholder. The value_* columns are deliberately
   not populated here; only the audit columns are copied from a.
   (An alternative that was left disabled seeded a changeid 999 row carrying
   a's current value_a / value_b / value_c instead.) */
insert into history_work
    (changeid,
     objectid,
     created_on,
     created_by,
     last_modified_on,
     last_modified_by)
select
    0,
    src.id,
    src.created_on,
    src.created_by,
    src.last_modified_on,
    src.last_modified_by
from a as src;
/* Copy every changelog entry whose action is not 'insert' into history_work,
   one row per entry. The entry's old_value / new_value land in the
   value_<x> / value_<x>_new slot that matches its field; the other slots stay
   NULL. The entry's created_on / created_by are written to both the created_*
   and last_modified_* columns. */
insert into history_work
    (changeid,
     objectid,
     value_a,
     value_b,
     value_c,
     value_a_new,
     value_b_new,
     value_c_new,
     created_on,
     created_by,
     last_modified_on,
     last_modified_by)
select
    cl.id,
    cl.object_id,
    case cl.field when 'value_a' then cl.old_value end,
    case cl.field when 'value_b' then cl.old_value end,
    case cl.field when 'value_c' then cl.old_value end,
    case cl.field when 'value_a' then cl.new_value end,
    case cl.field when 'value_b' then cl.new_value end,
    case cl.field when 'value_c' then cl.new_value end,
    cl.created_on,
    cl.created_by,
    cl.created_on,
    cl.created_by
from changelog as cl
where cl.action <> 'insert';
/* Derive each object's initial values and store them on its changeid 0
   placeholder row in history_work.

   For a field that was updated at least once, the initial value is the
   old_value of the earliest change (lowest changeid) to that field.
   For a field that was never updated there is no such change, so the value
   currently held in table a is used instead; without this fallback the
   placeholder, and every history row downfilled from it, would show NULL.

   NOTE(review): this relies on changelog ids increasing in chronological
   order per object, and it cannot distinguish a genuinely NULL old_value from
   "no change recorded" -- confirm both against the real data.

   Must run before the value_*_new columns are copied over value_*, because at
   this point value_* on the change rows still holds the old values. */
truncate table t;

insert into t
    (changeid, objectid)
select distinct
    0,
    objectid
from history_work;

update t
set value_a = (select hw.value_a
               from history_work hw
               where hw.objectid = t.objectid
                 and hw.changeid = (select min(f.changeid)
                                    from history_work f
                                    where f.objectid = hw.objectid
                                      and f.value_a is not null)),
    value_b = (select hw.value_b
               from history_work hw
               where hw.objectid = t.objectid
                 and hw.changeid = (select min(f.changeid)
                                    from history_work f
                                    where f.objectid = hw.objectid
                                      and f.value_b is not null)),
    value_c = (select hw.value_c
               from history_work hw
               where hw.objectid = t.objectid
                 and hw.changeid = (select min(f.changeid)
                                    from history_work f
                                    where f.objectid = hw.objectid
                                      and f.value_c is not null));

/* left join: an object missing from a still receives whatever t derived. */
update history_work h
inner join t on t.objectid = h.objectid
left join a cur on cur.id = h.objectid
set h.value_a = coalesce(t.value_a, cur.value_a),
    h.value_b = coalesce(t.value_b, cur.value_b),
    h.value_c = coalesce(t.value_c, cur.value_c)
where h.changeid = 0;
/* Get changes: on every row that carries a new value, overwrite the matching
   value_<x> column with value_<x>_new, so change rows describe the state
   after the change. Columns whose *_new slot is NULL keep their value. */
update history_work
set value_a = coalesce(value_a_new, value_a),
    value_b = coalesce(value_b_new, value_b),
    value_c = coalesce(value_c_new, value_c)
where value_a_new is not null
   or value_b_new is not null
   or value_c_new is not null;
/* Downfill and create the final table.

   Rows emitted: the changeid 0 placeholder of every object, plus, for each
   (created_on, created_by, object_id) group in changelog, the history_work
   row with the highest changelog id in that group.

   Each value column is downfilled: it takes the value from the same object's
   most recent history_work row (highest changeid <= the current row's) whose
   column is not NULL.

   The value lookups are restricted to the current row's objectid. changeid 0
   is shared by the placeholder rows of all objects, so matching on changeid
   alone returns more than one row as soon as a second object exists. */
truncate table history;

insert into history
    (changeid,
     objectid,
     value_a,
     value_b,
     value_c,
     valid_from,
     created_by,
     valid_to,
     last_modified_by)
select
    h.changeid,
    h.objectid,
    (select src.value_a
     from history_work src
     where src.objectid = h.objectid
       and src.changeid = (select max(prev.changeid)
                           from history_work prev
                           where prev.objectid = h.objectid
                             and prev.value_a is not null
                             and prev.changeid <= h.changeid)) as value_a,
    (select src.value_b
     from history_work src
     where src.objectid = h.objectid
       and src.changeid = (select max(prev.changeid)
                           from history_work prev
                           where prev.objectid = h.objectid
                             and prev.value_b is not null
                             and prev.changeid <= h.changeid)) as value_b,
    (select src.value_c
     from history_work src
     where src.objectid = h.objectid
       and src.changeid = (select max(prev.changeid)
                           from history_work prev
                           where prev.objectid = h.objectid
                             and prev.value_c is not null
                             and prev.changeid <= h.changeid)) as value_c,
    h.created_on,
    h.created_by,
    h.last_modified_on,
    h.last_modified_by
from history_work h
where h.changeid = 0
   or h.changeid in (select grp.maxid
                     from (select max(cl.id) as maxid
                           from changelog cl
                           group by cl.created_on, cl.created_by, cl.object_id) grp)
order by h.changeid;
/* Close the validity intervals.

   t is reused as a snapshot of history so the update below can read the
   following row without selecting from the table being updated
   (valid_from is stored in t.created_on, valid_to in t.last_modified_on).

   For each history row, valid_to / last_modified_by are taken from the next
   row of the SAME object (lowest changeid greater than the current one).
   The lookup is restricted to the current row's objectid; otherwise it would
   range over every object's rows. The newest row of each object has no
   successor and therefore ends up with NULL in both columns. */
truncate table t;

insert into t
    (changeid,
     objectid,
     value_a,
     value_b,
     value_c,
     created_on,
     created_by,
     last_modified_on,
     last_modified_by)
select
    changeid,
    objectid,
    value_a,
    value_b,
    value_c,
    valid_from,
    created_by,
    valid_to,
    last_modified_by
from history;

/* Intentionally no WHERE clause: every history row gets its interval end. */
update history h
set h.valid_to =
        (select nxt.created_on
         from t nxt
         where nxt.objectid = h.objectid
           and nxt.changeid = (select min(later.changeid)
                               from t later
                               where later.objectid = h.objectid
                                 and later.changeid > h.changeid)),
    h.last_modified_by =
        (select nxt.created_by
         from t nxt
         where nxt.objectid = h.objectid
           and nxt.changeid = (select min(later.changeid)
                               from t later
                               where later.objectid = h.objectid
                                 and later.changeid > h.changeid));

/* Final result, in a deterministic order. */
select
    changeid,
    objectid,
    value_a,
    value_b,
    value_c,
    valid_from,
    created_by,
    valid_to,
    last_modified_by
from history
order by objectid, changeid;