测试用例架构和数据如下:
-- Test fixture: every row pairs a value (vals) with a mask string (mask).
create table tmp
(
vals varchar(8),
mask varchar(8)
);
-- Column lists are spelled out so the inserts do not depend on the
-- physical column order of tmp.
insert into tmp (vals, mask) values ('12345678',' ');
insert into tmp (vals, mask) values ('12_45678',' _ ');
insert into tmp (vals, mask) values ('12345678',' _ ');
insert into tmp (vals, mask) values ('92345678',' ');
insert into tmp (vals, mask) values ('92345678',' _ ');
暂时忽略掩码列,并假设存在一个名为 specialmatch 的特殊匹配函数:
-- Rows whose vals match '12345678' under the assumed specialmatch function.
select
    VALS
from tmp
where specialmatch(vals, '12345678');
应该产生:
VALS
12345678
12_45678
12345678
接下来,
-- Rows whose vals match '92345678' under the assumed specialmatch function.
select
    VALS
from tmp
where specialmatch(vals, '92345678');
应该产生:
VALS
92345678
92345678
接下来,
-- The search value itself carries a '_' in its first position.
select
    VALS
from tmp
where specialmatch(vals, '_2345678');
应该产生:
VALS
12345678
12_45678
12345678
92345678
92345678
接下来,
-- The search value differs from the stored values in its third position.
select
    VALS
from tmp
where specialmatch(vals, '12945678');
应该产生:
VALS
12_45678
关于如何实现这个特殊匹配函数,有什么想法吗?
我天真的方法是编写一个特殊的字符串比较udf(伪代码):
// Pseudocode: compares two strings position by position; a '_' on either
// side stands for "any character" at that position.
// Returns false when either argument is null or when some position holds
// two different characters and neither of them is '_'; otherwise true.
bool function specialmatch(str1,str2) DETERMINISTIC
{
return false if either are null;
for each char1,char2 of str1,str2
{
// A position fails only if the characters differ and neither one is '_'.
if (char1<>char2 && char1<>'_' && char2<>'_') return false;
}
return true;
}
在进行匹配之前,需要先将掩码叠加到 vals 上。
例如:vals = '1_345678',mask = '_ _' => 1_34567_,它匹配 12345678 和 19345679,但不匹配 92345678。
但是如何利用索引,优化器等来实现这一目标......
答案 0 :(得分:0)
Oracle 10g确实有一个正则表达式函数,可以在这种情况下帮助你。 http://download.oracle.com/docs/cd/B19306_01/appdev.102/b14251/adfns_regexp.htm
此外,如果必须在数据库中执行此操作,则可以查看java存储过程。
不过我想不出在这种情况下有什么索引能帮到你,因为 _ 可以出现在任何位置,包括第一个字符。
答案 1 :(得分:0)
掩码只是单个字符吗?如果是这样,你可以用类似下面的语句来缩小可能的范围:
select VALS from tmp
where specialmatch(vals,'12945678')
and (substr(vals,1,4) = substr('12945678',1,4)
or substr(vals,5) = substr('12945678',5));
然后在 substr(vals,1,4) 和 substr(vals,5) 上建立基于函数的索引(FBI)。我似乎记得读到过,基于函数的索引有时得不到最佳的执行计划,所以另一种可选的 SQL 写法是:
-- Alternative to the OR form: one branch per substr() predicate.
-- UNION (not UNION ALL) is kept because a row whose first four characters
-- and whose tail both equal the search value satisfies both branches.
select VALS from tmp
where specialmatch(vals,'12945678')
and substr(vals,1,4) = substr('12945678',1,4)
union
select VALS from tmp
where specialmatch(vals,'12945678')
and substr(vals,5) = substr('12945678',5);
答案 2 :(得分:0)
下一个建议。 简单选项:_是LIKE的单个字符匹配,因此简单的解决方案是
-- '_' is LIKE's single-character wildcard, so the match is tested in both
-- directions: stored value as pattern, and search value as pattern.
SELECT *
FROM tmp
WHERE v_param LIKE vals
   OR vals LIKE v_param;
每次都是全表扫描,但省去了 SQL 与 PL/SQL 层之间的上下文切换。
复杂选项:在每个字符的 substr 上建立位图索引。这种需要组合多个索引的场景正是位图索引所擅长的。但对于更新频繁的列,或有大量小批量插入的表,位图索引并不合适。
我建立了一个小测试。首先,我向 TMP 中加载了约 10,000 个几乎随机生成的值。我不确定你的数据集有多大,也不确定没有通配符、有一个通配符或有多个通配符的条目各占多大比例,而这会对结果产生重大影响。
create table tmp ( vals varchar(8), mask varchar(8));
-- Load generated rows (level 1 .. 9999). Each base value is an 8-character,
-- zero-padded number; depending on the row number, some digits are replaced
-- by '_' (first matching WHEN wins):
--   every 3rd row : digits 3 and 4
--   every 5th row : digit 2
--   every 7th row : digits 7 and 8
--   every 11th row: digits 1 through 5
-- The statement is terminated with ';' so the PL/SQL block that follows is
-- not parsed as part of this INSERT.
insert into tmp (vals, mask)
select new_val, translate(new_val,'0123456789','__________')
from
(select case
when rn_3 is not null then translate(val,'34','__')
when rn_5 is not null then translate(val,'2','_')
when rn_7 is not null then translate(val,'78','__')
when rn_11 is not null then translate(val,'12345','_____')
else val end new_val
from
(select lpad(trunc(dbms_random.value(1,99999999)),8,'0') val,
decode(mod(rownum,3),0,1) rn_3, decode(mod(rownum,5),0,1) rn_5,
decode(mod(rownum,7),0,1) rn_7, decode(mod(rownum,11),0,1) rn_11
from dual connect by level < 10000)
);
declare
  -- Generated probe values (level 1 .. 999), built with the same digit-to-'_'
  -- substitution rules as the table load. The cursor is opened once per
  -- method, and dbms_random is re-evaluated each time, so the two methods
  -- are timed against different random samples.
  cursor c_probe is
    select case
             when rn_3 is not null then translate(val,'34','__')
             when rn_5 is not null then translate(val,'2','_')
             when rn_7 is not null then translate(val,'78','__')
             when rn_11 is not null then translate(val,'12345','_____')
             else val end try_val
    from
      (select lpad(trunc(dbms_random.value(1,99999999)),8,'0') val,
              decode(mod(rownum,3),0,1) rn_3, decode(mod(rownum,5),0,1) rn_5,
              decode(mod(rownum,7),0,1) rn_7, decode(mod(rownum,11),0,1) rn_11
       from dual connect by level < 1000);
  v_hits  number;  -- match count of the current probe (value itself is not reported)
  v_began number;  -- dbms_utility.get_time reading taken before each method
begin
  -- Method 1: LIKE evaluated in both directions against every row.
  v_began := dbms_utility.get_time;
  for probe in c_probe loop
    select count(*) into v_hits
    from tmp
    where (vals like probe.try_val or probe.try_val like vals);
  end loop;
  dbms_output.put_line('Meth 1 :'||(dbms_utility.get_time - v_began));

  -- Method 2: rows with mask = ' ' are matched directly; rows with
  -- mask > ' ' are matched against the probe merged with their mask.
  v_began := dbms_utility.get_time;
  for probe in c_probe loop
    select count(*) into v_hits from
      (select * from (select * from tmp where mask = ' ') v1
       where vals like probe.try_val
       union all
       select * from (select * from tmp where mask > ' ') v2
       where vals like maskmerge(mask,probe.try_val));
  end loop;
  dbms_output.put_line('Meth 2 :'||(dbms_utility.get_time - v_began));
end;
/
我将“双向 LIKE”与掩码合并(maskmerge)两种方法进行了比较。在测试中,LIKE 方法的耗时通常约为 200-250(单位为百分之一秒,由 dbms_utility.get_time 计时),而掩码合并方法大约需要十倍的时间。正如我所说,结果在很大程度上取决于数据分布。
答案 3 :(得分:0)
我将表“划分”为两个不同的集合:没有掩码的行(v1)和有掩码的行(v2)。
-- v1: rows with mask = ' ' are compared with the search value directly.
select *
from (select * from tmp where mask = ' ') v1
where vals like :srch
union all
-- v2: rows with mask > ' ' are compared with the search value after it has
-- been merged with the row's mask.
select *
from (select * from tmp where mask > ' ') v2
where vals like maskmerge(mask, :srch);
现在,优化器说:
Operation Object Name Rows Bytes Cost
SELECT STATEMENT Optimizer Mode=ALL_ROWS 2 5
UNION-ALL
TABLE ACCESS BY INDEX ROWID SCHEMA.TMP 1 90 2
INDEX RANGE SCAN SCHEMA.I_TMP_MASK 1 1
TABLE ACCESS BY INDEX ROWID SCHEMA.TMP 1 90 3
INDEX RANGE SCAN SCHEMA.I_TMP_MASK 2 1
这非常好:即使我的 :srch 中含有通配符,Oracle 也能对其进行优化。
最后,即使不加提示(hint),vals 和 mask 列上的普通索引也足以完成这一操作。已在 10g 上测试。注意:我们使用 union all,因为 v1 和 v2 总是互斥的。
供参考:
CREATE OR REPLACE FUNCTION maskmerge (A IN VARCHAR, B IN VARCHAR)
RETURN VARCHAR deterministic parallel_enable
IS
  -- Merges A and B character by character.
  --   * If A is null, B is returned unchanged; if B is null, A is returned.
  --   * Otherwise only the first least(length(A), length(B)) positions are
  --     merged; anything beyond the shorter string is dropped.
  --   * Per position: '_' on either side gives '_'; a space on one side
  --     gives the other side's character; otherwise B's character is used.
  merged varchar(4000);
  ch_a   char;
  ch_b   char;
BEGIN
  if a is null then
    return b;
  elsif b is null then
    return a;
  end if;

  for pos in 1 .. least(length(a), length(b)) loop
    ch_a := substr(a, pos, 1);
    ch_b := substr(b, pos, 1);
    merged := merged ||
      case
        when ch_a = '_' or ch_b = '_' then '_'
        when ch_a = ' ' then ch_b
        when ch_b = ' ' then ch_a
        else ch_b
      end;
  end loop;

  return merged;
END;
完整测试用例(采用典型的数据分布):
-----------------------------------------------------------------
CREATE OR REPLACE FUNCTION maskmerge (A IN VARCHAR, B IN VARCHAR)
RETURN VARCHAR deterministic parallel_enable
IS
  -- Merges A and B character by character.
  --   * If A is null, B is returned unchanged; if B is null, A is returned.
  --   * Otherwise only the first least(length(A), length(B)) positions are
  --     merged; anything beyond the shorter string is dropped.
  --   * Per position: '_' on either side gives '_'; a space on one side
  --     gives the other side's character; otherwise B's character is used.
  merged varchar(4000);
  ch_a   char;
  ch_b   char;
BEGIN
  if a is null then
    return b;
  elsif b is null then
    return a;
  end if;

  for pos in 1 .. least(length(a), length(b)) loop
    ch_a := substr(a, pos, 1);
    ch_b := substr(b, pos, 1);
    merged := merged ||
      case
        when ch_a = '_' or ch_b = '_' then '_'
        when ch_a = ' ' then ch_b
        when ch_b = ' ' then ch_a
        else ch_b
      end;
  end loop;

  return merged;
END;
/
-- One row per address: the address text plus a mask string of the same
-- declared width.
CREATE TABLE tmp
(
    id              INT         NOT NULL PRIMARY KEY,
    ipv6address     VARCHAR(32) NOT NULL,
    ipv6addressmask VARCHAR(32) DEFAULT (' ') NOT NULL
);
-- Source of id values (consumed by the insert trigger on tmp).
CREATE SEQUENCE s_tmp;
-- Plain single-column indexes on the two columns the search queries filter on.
CREATE INDEX i_tmp_mask ON tmp (ipv6addressmask);
CREATE INDEX i_tmp_addr ON tmp (ipv6address);
-- Before each row is inserted into tmp, overwrite id with the next value of
-- sequence s_tmp, whatever id the inserting statement supplied.
create or replace trigger t_i_tmp before insert on tmp referencing new as new old as old FOR EACH ROW
DECLARE
tmpVar tmp.id%TYPE;
begin
SELECT s_tmp.NEXTVAL INTO tmpVar FROM dual;
:new.id:=tmpVar;
end;
/
-- Seed the random generator with a fixed value before generating data.
exec dbms_random.initialize(17809465);
-- Generate 10,000 addresses: a 24-character prefix ('20010db8...' when the
-- random pick is 0, '...ffff' otherwise) followed by 8 random hex digits.
-- ipv6addressmask is left to its column default.
insert into tmp (ipv6address)
select decode(trunc(dbms_random.value(0,2)),0,'20010db80000000000000000',1,'00000000000000000000ffff','00000000000000000000ffff')
||trim(to_char(dbms_random.value(0, 4294967296),'0000000x'))
as val from dual
connect by level <= 10000;
-- Copy 200 randomly chosen existing rows; trigger t_i_tmp replaces the
-- copied id with a fresh sequence value, so the primary key stays unique.
insert into tmp (id, ipv6address, ipv6addressmask)
SELECT id, ipv6address, ipv6addressmask FROM
( SELECT id, ipv6address, ipv6addressmask FROM tmp
ORDER BY dbms_random.value )
WHERE rownum <= 200;
-- Hand-written rows around the address used as the search value below.
insert into tmp (id, ipv6address, ipv6addressmask) values (null,'00000000000000000000ffff12345678',' ');
insert into tmp (id, ipv6address, ipv6addressmask) values (null,'00000000000000000000ffff12345678',' _ ');
insert into tmp (id, ipv6address, ipv6addressmask) values (null,'00000000000000000000ffff1234567_',' __');
--select * from tmp order by ipv6address

-- Each UPDATE below touches a random subset of rows: the predicate compares
-- length(ipv6address)/32 * dbms_random.value with a small threshold.

-- network redaction of ipv4 (mask and address are both rewritten)
update tmp
set ipv6addressmask = maskmerge(' ______ ', ipv6addressmask),
    ipv6address     = maskmerge(' ______ ', ipv6address)
where length(ipv6address)/32*dbms_random.value < 0.005;

-- host redaction of ipv4 (mask and address are both rewritten)
update tmp
set ipv6addressmask = maskmerge(' __', ipv6addressmask),
    ipv6address     = maskmerge(' __', ipv6address)
where length(ipv6address)/32*dbms_random.value < 0.005;

-- full redaction of ipv4 (only rows already carrying the network mask)
update tmp
set ipv6addressmask = maskmerge(' __', ipv6addressmask),
    ipv6address     = maskmerge(' __', ipv6address)
where ipv6addressmask = ' ______ '
  and length(ipv6address)/32*dbms_random.value < 0.04;

-- network report redaction of ipv4 (only the mask is rewritten)
update tmp
set ipv6addressmask = maskmerge(' ______ ', ipv6addressmask)
where length(ipv6address)/32*dbms_random.value < 0.005;

-- host report redaction of ipv4 (only the mask is rewritten)
update tmp
set ipv6addressmask = maskmerge(' __', ipv6addressmask)
where length(ipv6address)/32*dbms_random.value < 0.005;

-- full report redaction of ipv4 (only the mask; rows already carrying the network mask)
update tmp
set ipv6addressmask = maskmerge(' __', ipv6addressmask)
where ipv6addressmask = ' ______ '
  and length(ipv6address)/32*dbms_random.value < 0.04;

-- How many addresses contain a '_' ...
select count(*)
from tmp
where instr(ipv6address, '_') > 0;

-- ... and how many rows have a mask greater than ' '.
select count(*)
from tmp
where ipv6addressmask > ' ';
-- srch := '00000000000000000000ffff12345678';
-- Approach 1: rows with ipv6addressmask = ' ' are matched directly; rows with
-- ipv6addressmask > ' ' are matched against :srch merged with their mask.
select * from (select * from tmp where ipv6addressmask = ' ') v1 where ipv6address like :srch
union all
select * from (select * from tmp where ipv6addressmask > ' ') v2 where ipv6address like maskmerge(ipv6addressmask,:srch);
/*
Operation Object Name Rows Bytes Cost
---------------------------------------- ----------- ---- ----- ----
SELECT STATEMENT Optimizer Mode=ALL_ROWS 510 29
UNION-ALL
TABLE ACCESS BY INDEX ROWID TMP 500 23K 10
INDEX RANGE SCAN I_TMP_ADDR 92 2
TABLE ACCESS BY INDEX ROWID TMP 10 490 19
INDEX RANGE SCAN I_TMP_MASK 207 2
*/
-- Approach 2: LIKE in both directions. The statement is terminated with ';'
-- so the plan comment and the cleanup statements that follow are not parsed
-- as part of it.
SELECT * FROM tmp WHERE ipv6address LIKE :srch OR :srch LIKE ipv6address;
/*
Operation Object Name Rows Bytes Cost
---------------------------------------- ----------- ---- ----- ----
SELECT STATEMENT Optimizer Mode=ALL_ROWS 995 22
TABLE ACCESS FULL TMP 995 47K 22
*/
-----------------------------------------------------------------
-- Cleanup: remove the objects the test case created.
DROP TABLE tmp;
DROP SEQUENCE s_tmp;
DROP FUNCTION maskmerge;
-----------------------------------------------------------------
答案 4 :(得分:0)
供参考:我发现,当你需要做 %var% 这样的匹配,并且要在大量数据上快速运行时,最好的方法是使用 Oracle Text 索引。