我有一个过程,用于对从生产环境移到测试环境的事务数据进行加扰。该表约有 1 亿行,分布在 50 个分区中。每个月都会添加一个新分区。随着数据量的增加,该过程的执行速度比以前慢了。
我正在考虑在我的代码中引入某种程度的并行化。这是新领域,我想知道是否有最佳实践。也许使用dbms_parallel_execute将更新分成多个块?
非常感谢任何有关如何优化我的代码的建议!
PROCEDURE Scramble_Transactions
AS
   -- Overwrites TRACE_NUM and TXTDATA in every row of TRANSACTIONS with
   -- pseudo-random digits. Rows are read in batches and written back with one
   -- FORALL update per batch; a single COMMIT is issued once all batches are
   -- done.
   --
   -- NULL column values are left untouched. A non-NULL TRACE_NUM becomes a
   -- 6-character string (left-padded with '1'), a non-NULL TXTDATA a
   -- 12-character string (left-padded with '3'). LPAD also cuts longer
   -- results down to that width.
   --
   -- NOTE(review): the cursor reads the same table the loop updates, so later
   -- fetches have to be served read-consistently against this session's own
   -- uncommitted changes. Processing one partition per call (or a single
   -- set-based UPDATE) avoids that cost - confirm before running on the full
   -- table.

   -- Number of rows fetched and updated per round trip.
   cBatchSize         CONSTANT PLS_INTEGER := 10000;

   CURSOR Transactions_cur
   IS
      SELECT T.ID,
             T.MONTH_PARTITION,
             T.TRACE_NUM,
             T.TXTDATA
        FROM TRANSACTIONS T;

   TYPE TBL IS TABLE OF Transactions_cur%ROWTYPE
      INDEX BY PLS_INTEGER;

   Transactions_Rec   TBL;
   vSeed              BINARY_INTEGER;

   -- Returns a random digit string for pValue, or NULL when pValue is NULL.
   -- The random number is drawn from [0, 10^n - 1) where n is the length of
   -- the trimmed input, truncated to an integer and then padded/cut by LPAD
   -- to exactly pWidth characters using pPad.
   FUNCTION Random_Digits (pValue   VARCHAR2,
                           pWidth   PLS_INTEGER,
                           pPad     VARCHAR2)
      RETURN VARCHAR2
   IS
      vLen   PLS_INTEGER;
   BEGIN
      IF pValue IS NULL
      THEN
         RETURN NULL;
      END IF;

      vLen := LENGTH (TRIM (pValue));

      RETURN LPAD (TRUNC (DBMS_RANDOM.VALUE (0, POWER (10, vLen) - 1)),
                   pWidth,
                   pPad);
   END Random_Digits;
BEGIN
   -- Seed from the minutes and seconds of the current time. 'miss' is what
   -- characters 11..14 of the 'ddmmyyyyhhmiss' format contain.
   -- DBMS_RANDOM.INITIALIZE / TERMINATE are obsolete; SEED alone is enough.
   vSeed := TO_NUMBER (TO_CHAR (SYSDATE, 'miss'));
   DBMS_RANDOM.SEED (vSeed);

   OPEN Transactions_cur;

   LOOP
      FETCH Transactions_cur
         BULK COLLECT INTO Transactions_Rec
         LIMIT cBatchSize;

      FOR i IN 1 .. Transactions_Rec.COUNT
      LOOP
         Transactions_Rec (i).TRACE_NUM :=
            Random_Digits (Transactions_Rec (i).TRACE_NUM, 6, '1');
         Transactions_Rec (i).TXTDATA :=
            Random_Digits (Transactions_Rec (i).TXTDATA, 12, '3');
      END LOOP;

      -- MONTH_PARTITION is part of the predicate in addition to ID.
      FORALL rec IN 1 .. Transactions_Rec.COUNT
         UPDATE Transactions
            SET TRACE_NUM = Transactions_Rec (rec).TRACE_NUM,
                TXTDATA = Transactions_Rec (rec).TXTDATA
          WHERE ID = Transactions_Rec (rec).ID
            AND MONTH_PARTITION = Transactions_Rec (rec).MONTH_PARTITION;

      -- Checked after the update so the final, partially filled batch is
      -- still written.
      EXIT WHEN Transactions_cur%NOTFOUND;
   END LOOP;

   CLOSE Transactions_cur;

   COMMIT;
END Scramble_Transactions;
编辑:我的解决方案基于以下反馈:重写了过程的一部分,使数据加扰在 SQL(而不是 PL/SQL)中完成。现在,该过程还接受起始/结束分区作为参数,以便进行并行处理。
CREATE OR REPLACE PROCEDURE Scramble_Transactions(P_MONTH_PARTITION_FROM VARCHAR2, P_MONTH_PARTITION_TO VARCHAR2)
AS
  -- Masks TRACE_NUM and TXTDATA for all TRANSACTIONS rows whose
  -- MONTH_PARTITION lies between P_MONTH_PARTITION_FROM and
  -- P_MONTH_PARTITION_TO (both inclusive). Every digit 0-9 in the two columns
  -- is replaced by '9'; other characters are kept.
  --
  -- Rows are fetched in batches of 10000 and written back with one FORALL
  -- update per batch. A single COMMIT is issued at the end.
  --
  -- The second parameter was previously also named P_MONTH_PARTITION_FROM,
  -- which is a duplicate argument name and left the upper bound unused.
  CURSOR Transactions_cur (V_MONTH_PARTITION_FROM TRANSACTIONS.MONTH_PARTITION%TYPE,
                           V_MONTH_PARTITION_TO   TRANSACTIONS.MONTH_PARTITION%TYPE) IS
    SELECT T.ID,
           T.MONTH_PARTITION,
           REGEXP_REPLACE(T.TRACE_NUM, '[0-9]', '9') TRACE_NUM,
           REGEXP_REPLACE(T.TXTDATA, '[0-9]', '9') TXTDATA
      FROM TRANSACTIONS T
     -- Use the cursor's own parameters so the range really is from..to.
     WHERE T.MONTH_PARTITION BETWEEN V_MONTH_PARTITION_FROM AND V_MONTH_PARTITION_TO;

  TYPE TBL IS TABLE OF Transactions_cur%ROWTYPE
    INDEX BY PLS_INTEGER;

  Transactions_Rec TBL;
BEGIN
  OPEN Transactions_cur(P_MONTH_PARTITION_FROM, P_MONTH_PARTITION_TO);

  LOOP
    FETCH Transactions_cur BULK COLLECT INTO Transactions_Rec LIMIT 10000;

    /*Some additional processing*/

    FORALL rec IN 1 .. Transactions_Rec.COUNT
      UPDATE Transactions
         SET TRACE_NUM = Transactions_Rec (rec).TRACE_NUM,
             TXTDATA = Transactions_Rec (rec).TXTDATA
       WHERE ID = Transactions_Rec (rec).ID
         AND MONTH_PARTITION = Transactions_Rec (rec).MONTH_PARTITION;

    -- Checked after the update so the last, partially filled batch is written.
    EXIT WHEN Transactions_cur%NOTFOUND;
  END LOOP;

  CLOSE Transactions_cur;

  COMMIT;
END;
/
现在通过使用DBMS_PARALLEL_EXECUTE并行执行该过程。根据partitionkey将查询分为多个块。
DECLARE
  -- Runs SCRAMBLE_TRANSACTIONS in parallel through DBMS_PARALLEL_EXECUTE:
  -- creates the task, builds one chunk per row returned by L_TASK_SQL, runs
  -- the procedure once per chunk with 6 parallel jobs, prints the final task
  -- status and then removes the chunks and the task.
  C_TASK_NAME CONSTANT VARCHAR2(50) := 'TRANSACTIONS_TASK';
  L_TASK_SQL  CLOB;
  V_TASKNAME  USER_PARALLEL_EXECUTE_TASKS.TASK_NAME%TYPE;
  V_STATUS    USER_PARALLEL_EXECUTE_TASKS.STATUS%TYPE;
BEGIN
  -- Each returned row supplies the (:START_ID, :END_ID) pair of one chunk.
  -- NOTE(review): the chunk bounds here are partition NAMES, while
  -- SCRAMBLE_TRANSACTIONS filters on the MONTH_PARTITION column, and
  -- BY_ROWID => FALSE is documented for NUMBER bounds. Confirm that the
  -- partition names are valid MONTH_PARTITION values in this schema.
  L_TASK_SQL := 'SELECT PARTITION_NAME, PARTITION_NAME FROM USER_TAB_PARTITIONS WHERE TABLE_NAME = ''TRANSACTIONS''';

  DBMS_PARALLEL_EXECUTE.CREATE_TASK(C_TASK_NAME);

  DBMS_PARALLEL_EXECUTE.CREATE_CHUNKS_BY_SQL(
    TASK_NAME => C_TASK_NAME,
    SQL_STMT  => L_TASK_SQL,
    BY_ROWID  => FALSE);

  DBMS_PARALLEL_EXECUTE.RUN_TASK(
    TASK_NAME      => C_TASK_NAME,
    SQL_STMT       => 'BEGIN SCRAMBLE_TRANSACTIONS( :START_ID, :END_ID ); END;',
    LANGUAGE_FLAG  => DBMS_SQL.NATIVE,
    PARALLEL_LEVEL => 6);

  SELECT TASK_NAME, STATUS
    INTO V_TASKNAME, V_STATUS
    FROM USER_PARALLEL_EXECUTE_TASKS
   WHERE TASK_NAME = C_TASK_NAME;

  -- Print the task name itself; it was previously the quoted text 'V_TASKNAME'.
  DBMS_OUTPUT.PUT_LINE('TASK:'|| V_TASKNAME ||' , STATUS:'|| V_STATUS);

  DBMS_PARALLEL_EXECUTE.DROP_CHUNKS(TASK_NAME => C_TASK_NAME);
  DBMS_PARALLEL_EXECUTE.DROP_TASK(TASK_NAME => C_TASK_NAME);
END;
/
总体总执行时间从之前的13-14小时降低到30分钟。
答案 0 :(得分:2)
SQL 是一个不错的选择,但也许有一个可以很快修复的问题:您正在更新的表正是游标正在读取的表。这可能会导致严重的 undo(撤消)问题,因为 fetch 必须提供与某一时间点一致的结果集。因此,每执行一次 fetch 循环,您要做的工作可能就越来越多(需要撤消刚刚完成的更新才能得到一致的读取结果)。当然,在每次循环中提交会带来出错时如何重新启动的问题。所以也许可以一次处理一个分区,不使用循环,例如:
PROCEDURE Scramble_Transactions(p_parname varchar2) AS
   -- Overwrites TRACE_NUM and TXTDATA with pseudo-random digits for the rows
   -- of a single partition of TRANSACTIONS.
   --
   -- p_parname: name of the partition to process. It is validated with
   --            DBMS_ASSERT.SIMPLE_SQL_NAME before being placed into the
   --            dynamic SQL text, so an invalid name raises an error instead
   --            of being executed.
   --
   -- Only rows where at least one of the two columns is non-NULL are read.
   -- All of them are fetched in one BULK COLLECT (no LIMIT), updated with one
   -- FORALL and committed once.
   -- NOTE(review): the whole partition is held in session memory; confirm the
   -- partition size makes that acceptable.
   --
   -- A non-NULL TRACE_NUM becomes a 6-character string (left-padded with
   -- '1'), a non-NULL TXTDATA a 12-character string (left-padded with '3').
   -- LPAD also cuts longer results down to that width.

   Transactions_cur   SYS_REFCURSOR;

   -- Never opened; it only provides the row shape for the collection type.
   CURSOR Transactions_cur_template
   IS
      SELECT T.ID,
             T.MONTH_PARTITION,
             T.TRACE_NUM,
             T.TXTDATA
        FROM TRANSACTIONS T;

   TYPE TBL IS TABLE OF Transactions_cur_template%ROWTYPE INDEX BY PLS_INTEGER;

   Transactions_Rec   TBL;
   vSeed              BINARY_INTEGER;
   vPartition         VARCHAR2 (130);

   -- Returns a random digit string for pValue, or NULL when pValue is NULL.
   -- The random number is drawn from [0, 10^n - 1) where n is the length of
   -- the trimmed input, truncated to an integer and then padded/cut by LPAD
   -- to exactly pWidth characters using pPad.
   FUNCTION Random_Digits (pValue   VARCHAR2,
                           pWidth   PLS_INTEGER,
                           pPad     VARCHAR2)
      RETURN VARCHAR2
   IS
      vLen   PLS_INTEGER;
   BEGIN
      IF pValue IS NULL
      THEN
         RETURN NULL;
      END IF;

      vLen := LENGTH (TRIM (pValue));

      RETURN LPAD (TRUNC (DBMS_RANDOM.VALUE (0, POWER (10, vLen) - 1)),
                   pWidth,
                   pPad);
   END Random_Digits;
BEGIN
   -- Identifiers cannot be bound, so validate the name before concatenating.
   vPartition := DBMS_ASSERT.SIMPLE_SQL_NAME (p_parname);

   -- Seed from the minutes and seconds of the current time. 'miss' is what
   -- characters 11..14 of the 'ddmmyyyyhhmiss' format contain.
   -- DBMS_RANDOM.INITIALIZE / TERMINATE are obsolete; SEED alone is enough.
   vSeed := TO_NUMBER (TO_CHAR (SYSDATE, 'miss'));
   DBMS_RANDOM.SEED (vSeed);

   -- The PARTITION (...) clause must precede the table alias.
   OPEN Transactions_cur FOR
         'SELECT T.ID, T.MONTH_PARTITION, T.TRACE_NUM, T.TXTDATA'
      || ' FROM TRANSACTIONS PARTITION (' || vPartition || ') T'
      || ' WHERE T.TRACE_NUM IS NOT NULL OR T.TXTDATA IS NOT NULL';

   FETCH Transactions_cur BULK COLLECT INTO Transactions_Rec;

   -- Everything is in memory now, so the cursor is closed before the update.
   CLOSE Transactions_cur;

   FOR i IN 1 .. Transactions_Rec.COUNT
   LOOP
      Transactions_Rec (i).TRACE_NUM :=
         Random_Digits (Transactions_Rec (i).TRACE_NUM, 6, '1');
      Transactions_Rec (i).TXTDATA :=
         Random_Digits (Transactions_Rec (i).TXTDATA, 12, '3');
   END LOOP;

   FORALL rec IN 1 .. Transactions_Rec.COUNT
      UPDATE Transactions
         SET TRACE_NUM = Transactions_Rec (rec).TRACE_NUM,
             TXTDATA = Transactions_Rec (rec).TXTDATA
       WHERE ID = Transactions_Rec (rec).ID
         AND MONTH_PARTITION = Transactions_Rec (rec).MONTH_PARTITION;

   COMMIT;
END Scramble_Transactions;
因此,只需更改几行代码,我们就可以
然后您可以为每个分区名称提交一个作业(使用DBMS_SCHEDULER),并且由于我们现在正在隔离每个分区,因此我们不会在各个作业之间引起争用。
不要误会我-完全重构SQL也许仍然是最好的选择,但是就快速获胜而言,上面的代码可以用最少的更改来解决您的问题。
答案 1 :(得分:0)
我认为,如果使用 CTAS(CREATE TABLE ... AS SELECT)或 INSERT /*+ APPEND */ ... 而不是 UPDATE,性能会更好。由于您的数据已分区,因此可以使用分区交换。这样您就可以结合直接路径加载操作更有效地利用并行性。