create or replace procedure prcdr_Clustering is
v_sampleCount number;
v_sampleFlag number;
v_matchPercent number;
v_SpendAmount Number(18, 2);
cursor cur_PDCSample is
SELECT *
FROM TBL_BIL
WHERE UDF_CHK = 'N';
rec_Pdcsample TBL_BIL%rowtype;
BEGIN
OPEN cur_PDCSample;
LOOP
FETCH cur_PDCSample
into rec_Pdcsample;
EXIT WHEN cur_PDCSample%NOTFOUND;
SELECT COUNT(*)
INTO v_sampleCount
FROM TBL_BIL
WHERE UDF_TOKENIZED = rec_Pdcsample.UDF_TOKENIZED;
IF v_sampleCount <> 0 THEN
UPDATE TBL_BIL
SET UDF_CHK = 'Y'
WHERE UDF_TOKENIZED = rec_Pdcsample.UDF_TOKENIZED;
IF v_sampleCount > 1 THEN
v_sampleFlag := 1;
ELSE
IF v_sampleCount = 1 THEN
v_sampleFlag := 2;
ELSE
v_sampleFlag := 0;
END IF;
END IF;
UPDATE TBL_BIL
SET UDF_SAMPLECOUNT = v_sampleCount, UDF_SAMPLEFLAG = v_sampleFlag
WHERE uniqueid = rec_Pdcsample.uniqueid;
UPDATE TBL_BIL
SET UDF_PID = rec_Pdcsample.uniqueid
WHERE UDF_TOKENIZED = rec_Pdcsample.UDF_TOKENIZED;
UPDATE TBL_BIL
SET UDF_PIDSPEND = v_SpendAmount
WHERE uniqueid = rec_Pdcsample.uniqueid;
UPDATE TBL_BIL
SET UDF_MATCHPERCENT = 1
WHERE uniqueid <> rec_Pdcsample.uniqueid
AND UDF_TOKENIZED = rec_Pdcsample.UDF_TOKENIZED;
END IF;
IF cur_PDCSample%ISOPEN THEN
CLOSE cur_PDCSample;
END IF;
OPEN cur_PDCSample;
END LOOP;
IF cur_PDCSample%ISOPEN THEN
CLOSE cur_PDCSample;
END IF;
end PrcdrClustering;
执行需要几天时间,我的表有225,846行数据。
我桌子的结构是: -
UNIQUEID NUMBER Notnull primary key
VENDORNAME VARCHAR2(200)
SHORTTEXT VARCHAR2(500)
SPENDAMT NUMBER(18,2)
UDF_TOKENIZED VARCHAR2(999)
UDF_PID NUMBER(10)
UDF_SAMPLEFLAG NUMBER(4)
UDF_SAMPLECOUNT NUMBER(4)
UDF_MATCHPERCENT NUMBER(4)
UDF_TOKENCNT NUMBER(4)
UDF_PIDSPEND NUMBER(18,2)
UDF_CHK VARCHAR2(1)
答案 0 :(得分:4)
从哪里开始?我有很多要点。
bulk collect ... forall
会更有效率。 elsif
来减少要评估的陈述数量(非常非常小的胜利)uniqueid
是唯一的,您可以使用rowid
更新表格。udf_pidspend
更新为null,无论是否有意,都无需为其单独更新。tbl_bil
uniqueid
被编入索引
作为一般规则:
bulk collect
。 select *
rowid
避免所有索引问题。这只适用于11G,最近我回答this question,我提供了自己的方式来处理11G之前版本中的实施限制,并链接到Ollie's,Tom Kyte和{{3 }}
我不完全确定你在这里要做什么,所以请原谅我,如果逻辑稍微偏离。
create or replace procedure prcdr_Clustering is
cursor c_pdcsample is
select rowid as rid
, count(*) over ( partition by udf_tokenized ) as samplecount
, udf_chk
, max(uniqueid) over ( partition by udf_tokenized ) as udf_pid
from tbl_bil
where udf_chk = 'N';
type t__pdcsample is table of c_pdcsample%rowtype index by binary_integer;
t_pdcsample t__pdcsample;
begin
open c_pdcsample;
loop
fetch c_pdcsample bulk collect into t_pdcsample limit 1000;
exit when t_pdcsample.count = 0;
if t_pdcsample.samplecount <> 0 then
t_pdcsample.udf_chk := 'y';
if t_pdcsample.samplecount > 1 then
t_pdcsample.samplecount := 1;
elsif t_pdcsample.samplecount = 1 then
t_pdcsample.samplecount := 2;
else
t_pdcsample.samplecount := 0;
end if;
end if;
forall i in t_pdcsample.first .. t_pdcsample.last
update tbl_bil
set udfsamplecount = t_pdcsample.samplecount
, udf_sampleflag = t_pdcsample.sampleflag
, udf_pidspend = null
, udf_pid = t_pdcsample.udf_pid
where rowid = t_pdcsample(i).rowid
;
for i in t_pdcsample.first .. t_pdcsample.last loop
update tbl_bil TBL_BIL
set udfmatchpercent = 1
where uniqueid <> t_pdcsample.uniqueid
and udf_tokenized = t_pdcsample.udf_tokenized;
end loop;
commit ;
end loop;
close c_pdcsample;
end PrcdrClustering;
/
最后调用所有表tbl_...
有点不必要。
答案 1 :(得分:4)
这是使用单个SQL语句的变体。我不是100%肯定逻辑是完全相同的,但对于我的测试集,它是。当你有多个记录时,当udf_chk ='N'并且udf_tokenized相同时,当前程序是不确定的......
这是重构过程
SQL> create procedure prcdr_clustering_refactored
2 is
3 begin
4 merge into tbl_bil t
5 using ( select tb1.uniqueid
6 , count(*) over (partition by tb1.udf_tokenized) cnt
7 , max(decode(udf_chk,'N',uniqueid)) over (partition by tb1.udf_tokenized order by tb1.udf_chk) pid
8 from tbl_bil tb1
9 where udf_chk = 'N'
10 or exists
11 ( select 'dummy'
12 from tbl_bil tb2
13 where tb2.udf_tokenized = tb1.udf_tokenized
14 )
15 ) q
16 on ( t.uniqueid = q.uniqueid )
17 when matched then
18 update
19 set t.udf_samplecount = decode(t.udf_chk,'N',q.cnt,t.udf_samplecount)
20 , t.udf_sampleflag = decode(t.udf_chk,'N',decode(q.cnt,1,2,1),t.udf_sampleflag)
21 , t.udf_pid = q.pid
22 , t.udf_pidspend = decode(t.udf_chk,'N',null,t.udf_pidspend)
23 , t.udf_matchpercent = decode(t.udf_chk,'N',t.udf_matchpercent,1)
24 , t.udf_chk = 'Y'
25 ;
26 end;
27 /
Procedure created.
这是一个测试:
SQL> select *
2 from tbl_bil
3 order by uniqueid
4 /
UNIQUEID VENDORNAME SHORTTEXT SPENDAMT UDF_TOKENI UDF_PID UDF_SAMPLEFLAG UDF_SAMPLECOUNT UDF_MATCHPERCENT UDF_TOKENCNT UDF_PIDSPEND U
-------- ---------- ---------- -------- ---------- ------- -------------- --------------- ---------------- ------------ ------------ -
1 a a 1 bl 0 0 0 0 0 0 N
2 a a 1 bla 0 0 0 0 0 0 N
3 a a 1 bla 0 0 0 0 0 0 Y
4 a a 1 bla 0 0 0 0 0 0 Y
5 a a 1 bla 0 0 0 0 0 0 Y
6 a a 1 blah 0 0 0 0 0 0 N
7 a a 1 blah 0 0 0 0 0 0 Y
8 a a 1 blah 0 0 0 0 0 0 Y
9 a a 1 blah 0 0 0 0 0 0 Y
10 a a 1 blah 0 0 0 0 0 0 Y
11 a a 1 blah 0 0 0 0 0 0 Y
11 rows selected.
SQL> exec prcdr_clustering
PL/SQL procedure successfully completed.
SQL> select *
2 from tbl_bil
3 order by uniqueid
4 /
UNIQUEID VENDORNAME SHORTTEXT SPENDAMT UDF_TOKENI UDF_PID UDF_SAMPLEFLAG UDF_SAMPLECOUNT UDF_MATCHPERCENT UDF_TOKENCNT UDF_PIDSPEND U
-------- ---------- ---------- -------- ---------- ------- -------------- --------------- ---------------- ------------ ------------ -
1 a a 1 bl 1 2 1 0 0 Y
2 a a 1 bla 2 1 4 0 0 Y
3 a a 1 bla 2 0 0 1 0 0 Y
4 a a 1 bla 2 0 0 1 0 0 Y
5 a a 1 bla 2 0 0 1 0 0 Y
6 a a 1 blah 6 1 6 0 0 Y
7 a a 1 blah 6 0 0 1 0 0 Y
8 a a 1 blah 6 0 0 1 0 0 Y
9 a a 1 blah 6 0 0 1 0 0 Y
10 a a 1 blah 6 0 0 1 0 0 Y
11 a a 1 blah 6 0 0 1 0 0 Y
11 rows selected.
SQL> rollback
2 /
Rollback complete.
SQL> exec prcdr_clustering_refactored
PL/SQL procedure successfully completed.
SQL> select *
2 from tbl_bil
3 order by uniqueid
4 /
UNIQUEID VENDORNAME SHORTTEXT SPENDAMT UDF_TOKENI UDF_PID UDF_SAMPLEFLAG UDF_SAMPLECOUNT UDF_MATCHPERCENT UDF_TOKENCNT UDF_PIDSPEND U
-------- ---------- ---------- -------- ---------- ------- -------------- --------------- ---------------- ------------ ------------ -
1 a a 1 bl 1 2 1 0 0 Y
2 a a 1 bla 2 1 4 0 0 Y
3 a a 1 bla 2 0 0 1 0 0 Y
4 a a 1 bla 2 0 0 1 0 0 Y
5 a a 1 bla 2 0 0 1 0 0 Y
6 a a 1 blah 6 1 6 0 0 Y
7 a a 1 blah 6 0 0 1 0 0 Y
8 a a 1 blah 6 0 0 1 0 0 Y
9 a a 1 blah 6 0 0 1 0 0 Y
10 a a 1 blah 6 0 0 1 0 0 Y
11 a a 1 blah 6 0 0 1 0 0 Y
11 rows selected.
的问候,
罗布。
答案 2 :(得分:0)
我不知道为什么,但你打开cur_PDCSample,选择(我怀疑)成千上万的记录。然后,在循环中,关闭光标并重新打开它,每次只处理返回的第一条记录。
如果您打开光标一次,处理每条记录然后关闭它,您的程序可能会更快。
实际上,由于您并不总是将TBL_BIL.UDF_CHK更新为“Y”,因此在我看来您当前的程序可能无限运行。