SQL:从更改日志恢复历史数据

时间:2016-06-12 12:43:23

标签: mysql sql data-warehouse

有一张表A。表A中的一行如下所示:

+----+---------+---------+---------+------------+------------+------------------+------------------+
| id | value_a | value_b | value_c | created_on | created_by | last_modified_on | last_modified_by |
+----+---------+---------+---------+------------+------------+------------------+------------------+
| 42 | x       | y       | z       | 2016-04-01 | Maria      | 2016-05-01       | Jim              |
+----+---------+---------+---------+------------+------------+------------------+------------------+

因此,表A仅包含最新值。

还有一个名为changelog的表。它存储了有关表A的所有更改/更新。针对表A中这条记录的changelog记录如下所示:

+-----+-----------+--------+---------+-----------+-----------------------------------------+------------+------------+
| id  | object_id | action | field   | old_value | new_value                               | created_on | created_by |
+-----+-----------+--------+---------+-----------+-----------------------------------------+------------+------------+
| 234 | 42        | insert | NULL    | NULL      | {value_a: xx, value_b: yy, value_c: zz} | 2016-04-01 | Maria      |
| 456 | 42        | update | value_a | xx        | x                                       | 2016-04-05 | Bob        |
| 467 | 42        | update | value_b | yy        | y                                       | 2016-05-01 | Jim        |
| 678 | 42        | update | value_c | zz        | z                                       | 2016-05-01 | Jim        |
+-----+-----------+--------+---------+-----------+-----------------------------------------+------------+------------+

我需要创建一个historical_A表,对于这条特定记录,它将如下所示:

+----+---------+---------+---------+------------+------------+------------+--------------+
| id | value_a | value_b | value_c | valid_from | created_by |  valid_to  | modified_by  |
+----+---------+---------+---------+------------+------------+------------+--------------+
| 42 | xx      | yy      | zz      | 2016-04-01 | Maria      | 2016-04-05 | Bob          |
| 42 | x       | yy      | zz      | 2016-04-05 | Bob        | 2016-05-01 | Jim          |
| 42 | x       | y       | z       | 2016-05-01 | Jim        |            |              |
+----+---------+---------+---------+------------+------------+------------+--------------+

表A大约有1 500 000行,表A对应的changelog表大约有27 000 000行。

目前我正在使用SQL和Python脚本进行初始转换(加载)。基本上,我先为初始行生成一条insert语句(通过解析json),然后按changelog表的created_on列分组,生成所有后续的insert语句。目前,处理表A的1000行大约需要3分钟。因此,我将脚本并行化(x10)执行,以便更及时地获得结果。

我怀疑Sql + Python脚本不是问题的最佳解决方案。是否有针对所提出问题的纯SQL解决方案? 是否有针对此类问题的既定最佳实践?

1 个答案:

答案 0 :(得分:-1)

不幸的是,我的MySQL机器坏了,所以我是在SQL Server中完成的,但我认为代码中不存在任何兼容性问题。如果它对你有效,我很想知道它的性能如何。您可能需要添加索引来提升性能。 - SQL Restoring historical data from the changelog

/*
Setup DDL (commented out) and the expected output for the sample data.
The statements below this comment truncate and fill a, changelog,
history_work, history and t, so these tables must exist before the script
is run. The backtick quoting and ENGINE clauses are MySQL syntax.

create table a
( id int, value_a varchar(20), value_b varchar(20), value_c varchar(20),
created_on date, created_by varchar(20), last_modified_on date, last_modified_by varchar(20));

create table changelog 
( id int,  object_id int, action varchar(20),  field varchar(20) , old_value varchar(20), new_value varchar(50), created_on date, created_by varchar(20));

create table history_work
(changeid int,objectid int, value_a varchar(20), value_b varchar(20), value_c varchar(20), value_a_new varchar(20), value_b_new varchar(20), value_c_new varchar(20),
created_on date, created_by varchar(20), last_modified_on date, last_modified_by varchar(20));

CREATE TABLE `history` (
    `changeid` INT(11) NULL DEFAULT NULL,
    `objectid` INT(11) NULL DEFAULT NULL,
    `value_a` VARCHAR(20) NULL DEFAULT NULL,
    `value_b` VARCHAR(20) NULL DEFAULT NULL,
    `value_c` VARCHAR(20) NULL DEFAULT NULL,
    `valid_from` DATE NULL DEFAULT NULL,
    `created_by` VARCHAR(20) NULL DEFAULT NULL,
    `valid_to` DATE NULL DEFAULT NULL,
    `last_modified_by` VARCHAR(20) NULL DEFAULT NULL
)
COLLATE='latin1_swedish_ci'
ENGINE=InnoDB
;

drop table if exists t;
CREATE TABLE `t` (
    `changeid` INT(11) NULL DEFAULT NULL,
    `objectid` INT(11) NULL DEFAULT NULL,
    `value_a` VARCHAR(20) NULL DEFAULT NULL,
    `value_b` VARCHAR(20) NULL DEFAULT NULL,
    `value_c` VARCHAR(20) NULL DEFAULT NULL,
    `value_a_new` VARCHAR(20) NULL DEFAULT NULL,
    `value_b_new` VARCHAR(20) NULL DEFAULT NULL,
    `value_c_new` VARCHAR(20) NULL DEFAULT NULL,
    `created_on` DATE NULL DEFAULT NULL,
    `created_by` VARCHAR(20) NULL DEFAULT NULL,
    `last_modified_on` DATE NULL DEFAULT NULL,
    `last_modified_by` VARCHAR(20) NULL DEFAULT NULL
)
COLLATE='latin1_swedish_ci'
ENGINE=InnoDB
;

Expected result (contents of history for the sample data inserted below):
+----+---------+---------+---------+------------+------------+------------+--------------+
| id | value_a | value_b | value_c | valid_from | created_by |  valid_to  | modified_by  |
+----+---------+---------+---------+------------+------------+------------+--------------+
| 42 | xx      | yy      | zz      | 2016-04-01 | Maria      | 2016-04-05 | Bob          |
| 42 | x       | yy      | zz      | 2016-04-05 | Bob        | 2016-05-01 | Jim          |
| 42 | x       | y       | z       | 2016-05-01 | Jim        |            |              |
+----+---------+---------+---------+------------+------------+------------+--------------+

*/

/*
Demo fixture: empties the source and work tables and loads the single sample
object (id 42) together with its four changelog entries.
WARNING: this truncates a and changelog. Skip this block entirely when running
the transformation against real data.
*/
truncate table a;
truncate table changelog;
truncate table history_work;

-- Column lists are spelled out so the inserts do not depend on the physical
-- column order of the tables.
insert into a
    (id, value_a, value_b, value_c,
     created_on, created_by, last_modified_on, last_modified_by)
values
    (42, 'x', 'y', 'z', '2016-04-01', 'Maria', '2016-05-01', 'Jim');

insert into changelog
    (id, object_id, action, field, old_value, new_value, created_on, created_by)
values
    (234, 42, 'insert', NULL,      NULL, '{value_a: xx, value_b: yy, value_c: zz}', '2016-04-01', 'Maria'),
    (456, 42, 'update', 'value_a', 'xx', 'x',                                       '2016-04-05', 'Bob'),
    (467, 42, 'update', 'value_b', 'yy', 'y',                                       '2016-05-01', 'Jim'),
    (678, 42, 'update', 'value_c', 'zz', 'z',                                       '2016-05-01', 'Jim');

/*
Placeholder "insert" row: one history_work row per object in a, tagged with
changeid 0. Only the audit columns are copied; value_a / value_b / value_c are
left NULL at this point.
*/
insert into history_work
    (changeid, objectid,
     created_on, created_by, last_modified_on, last_modified_by)
select
    0,
    src.id,
    src.created_on,
    src.created_by,
    src.last_modified_on,
    src.last_modified_by
from a as src;

/*
One history_work row per non-insert changelog entry. The changed field's old
and new values are pivoted into the matching value_x / value_x_new columns;
the columns for the other fields stay NULL. The changelog's created_on /
created_by are written to both the created_* and last_modified_* columns.
*/
insert into history_work
    (changeid, objectid,
     value_a, value_b, value_c,
     value_a_new, value_b_new, value_c_new,
     created_on, created_by, last_modified_on, last_modified_by)
select
    cl.id,
    cl.object_id,
    case cl.field when 'value_a' then cl.old_value end,
    case cl.field when 'value_b' then cl.old_value end,
    case cl.field when 'value_c' then cl.old_value end,
    case cl.field when 'value_a' then cl.new_value end,
    case cl.field when 'value_b' then cl.new_value end,
    case cl.field when 'value_c' then cl.new_value end,
    cl.created_on,
    cl.created_by,
    cl.created_on,
    cl.created_by
from changelog as cl
where cl.action <> 'insert';

/*
Derive each object's initial (insert-time) values and store them on its
changeid-0 placeholder row in history_work.

For every column the initial value is the old_value of the earliest change
row (smallest changeid) that touched that column. If no change row has a value
for the column, it falls back to the current value in table a: a column that
was never updated still holds its initial value. Without this fallback such a
column would be NULL in every generated history row.
NOTE(review): the fallback assumes a.id is unique per object - confirm.
*/
truncate table t;

insert into t
    (changeid, objectid)
select distinct
    0,
    objectid
from history_work;

update t
set value_a = coalesce(
        (select hw.value_a
         from history_work hw
         where hw.objectid = t.objectid
           and hw.changeid = (select min(f.changeid)
                              from history_work f
                              where f.objectid = hw.objectid
                                and f.value_a is not null)),
        (select cur.value_a from a cur where cur.id = t.objectid)),
    value_b = coalesce(
        (select hw.value_b
         from history_work hw
         where hw.objectid = t.objectid
           and hw.changeid = (select min(f.changeid)
                              from history_work f
                              where f.objectid = hw.objectid
                                and f.value_b is not null)),
        (select cur.value_b from a cur where cur.id = t.objectid)),
    value_c = coalesce(
        (select hw.value_c
         from history_work hw
         where hw.objectid = t.objectid
           and hw.changeid = (select min(f.changeid)
                              from history_work f
                              where f.objectid = hw.objectid
                                and f.value_c is not null)),
        (select cur.value_c from a cur where cur.id = t.objectid));

-- Copy the derived initial values onto the placeholder rows.
update history_work h
join t on t.objectid = h.objectid
set h.value_a = t.value_a,
    h.value_b = t.value_b,
    h.value_c = t.value_c
where h.changeid = 0;

#select     * from history_work;

/*
Get changes: on every row that carries a new value, overwrite the stored value
with it, so value_a / value_b / value_c hold the state after the change.
Columns whose *_new value is NULL keep their current content.
*/
update history_work
set value_a = coalesce(value_a_new, value_a),
    value_b = coalesce(value_b_new, value_b),
    value_c = coalesce(value_c_new, value_c)
where value_a_new is not null
   or value_b_new is not null
   or value_c_new is not null;

/*
Down-fill and create the final table.

Emits one history row for each placeholder row (changeid 0) and for the last
changelog entry of every (created_on, created_by, object_id) group. Each value
column is taken from the most recent history_work row of the same object, at
or before the current changeid, that has a non-NULL value for that column.

The value lookups are restricted to the current object. Placeholder rows share
changeid 0 across all objects, so a lookup by changeid alone would return
several rows (or another object's value) once more than one object exists.
*/
truncate table history;

insert into history
    (changeid, objectid,
     value_a, value_b, value_c,
     valid_from, created_by, valid_to, last_modified_by)
select
    h.changeid,
    h.objectid,
    (select src.value_a
     from history_work src
     where src.objectid = h.objectid
       and src.changeid = (select max(prev.changeid)
                           from history_work prev
                           where prev.objectid = h.objectid
                             and prev.value_a is not null
                             and prev.changeid <= h.changeid)) as value_a,
    (select src.value_b
     from history_work src
     where src.objectid = h.objectid
       and src.changeid = (select max(prev.changeid)
                           from history_work prev
                           where prev.objectid = h.objectid
                             and prev.value_b is not null
                             and prev.changeid <= h.changeid)) as value_b,
    (select src.value_c
     from history_work src
     where src.objectid = h.objectid
       and src.changeid = (select max(prev.changeid)
                           from history_work prev
                           where prev.objectid = h.objectid
                             and prev.value_c is not null
                             and prev.changeid <= h.changeid)) as value_c,
    h.created_on,
    h.created_by,
    h.last_modified_on,
    h.last_modified_by
from history_work h
where h.changeid = 0
   or h.changeid in (select s.maxid
                     from (select max(cl.id) as maxid
                           from changelog cl
                           group by cl.created_on, cl.created_by, cl.object_id) s)
order by h.changeid;

/*
Reload t with a copy of history: valid_from goes to created_on and valid_to to
last_modified_on. The UPDATE of history that follows reads this copy rather
than history itself (presumably because MySQL does not allow a subquery to
select from the table being updated).
*/
truncate table t;

insert into t
    (changeid, objectid,
     value_a, value_b, value_c,
     created_on, created_by, last_modified_on, last_modified_by)
select
    hist.changeid,
    hist.objectid,
    hist.value_a,
    hist.value_b,
    hist.value_c,
    hist.valid_from,
    hist.created_by,
    hist.valid_to,
    hist.last_modified_by
from history as hist;

/*
Close the validity intervals: every history row gets valid_to and
last_modified_by from the next row (next higher changeid) of the SAME object,
read from the copy in t. The latest row of each object has no successor and
ends up with NULL in both columns.

Both lookups are restricted to h.objectid. Without that restriction the
subquery matches one "next" row per object and fails (or picks another
object's row) as soon as the tables hold more than one object.
The UPDATE intentionally has no WHERE clause: all rows are recomputed.
*/
update history h
set h.valid_to = (select nxt.created_on
                  from t nxt
                  where nxt.objectid = h.objectid
                    and nxt.changeid = (select min(b.changeid)
                                        from t b
                                        where b.objectid = h.objectid
                                          and b.changeid > h.changeid)),
    h.last_modified_by = (select nxt.created_by
                          from t nxt
                          where nxt.objectid = h.objectid
                            and nxt.changeid = (select min(b.changeid)
                                                from t b
                                                where b.objectid = h.objectid
                                                  and b.changeid > h.changeid));

-- Show the result in a deterministic order.
select
    changeid,
    objectid,
    value_a,
    value_b,
    value_c,
    valid_from,
    created_by,
    valid_to,
    last_modified_by
from history
order by objectid, changeid;