我需要处理很多文本字段。为了处理它们,我需要做的第一件事是规范化我处理的字符集。我需要输出字符串包含以下内容;
A-Z,0-9和空格,我希望所有小写都转换为大写。
所以我在pl / sql中使用以下内容;
X := UPPER(TRIM(REGEXP_REPLACE
(REGEXP_REPLACE(X, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
这很慢。什么会更快?
答案 0 :(得分:1)
您可以尝试这种方法,基于一些(非常)松散的测试,它看起来要快得多。它是一个本地编译的函数:
CREATE OR REPLACE function clean_string(
in_string in varchar2)
return varchar2 AS
out_string varchar2(4000) := '';
in_length number;
cnt number := 0;
in_char char(1);
out_char char(1);
dec_char number;
prev_space boolean := false;
begin
--dbms_output.put_line('In string: ' || in_string);
in_length := LENGTH(in_string);
while cnt < in_length
LOOP
cnt := cnt + 1;
in_char := substr(in_string, cnt, 1);
dec_char := ascii(in_char);
-- blank out non alphanumerics
IF (
(dec_char >= 48 AND dec_char <= 57) OR
(dec_char >= 65 AND dec_char <= 90) OR
(dec_char >= 97 AND dec_char <= 122)
) THEN
--keep it
out_char := in_char;
ELSE
out_char := ' ';
END IF;
IF (NOT(prev_space AND out_char = ' ')) THEN
out_string := out_string || out_char;
END IF;
<<endloop>>
IF (out_char = ' ') THEN
prev_space := true;
ELSE
prev_space := false;
END IF;
END LOOP;
return trim(upper(out_string));
end;
ALTER SESSION SET PLSQL_CODE_TYPE=NATIVE;
ALTER function clean_string COMPILE;
为了测试,我从表中抽取了500万行并清理了一些字符串:
set serveroutput on
declare
cursor sel_cur1 is
select name, clean_string(name) as cln_name,
address1, clean_string(address1) as cln_addr1,
address2, clean_string(address2) as cln_addr2,
city, clean_string(city) as cln_city,
state, clean_string(state) as cln_state,
postalcode, clean_string(postalcode) as cln_zip
from my_table
where rownum <= 5000000;
cursor sel_cur2 is
select name,
address1,
address2,
city,
state,
postalcode
from my_table
where rownum <= 5000000;
l_cnt integer := 0;
l_cln_name varchar2(100);
l_cln_addr1 varchar2(100);
l_cln_addr2 varchar2(100);
l_cln_city varchar2(100);
l_cln_state varchar2(100);
l_cln_zip varchar2(100);
l_interval interval day to second(4);
l_start timestamp;
l_end timestamp;
begin
l_start := systimestamp;
for rec in sel_cur2
loop
l_cnt := l_cnt + 1;
l_cln_name := clean_string(rec.name);
l_cln_addr1 := clean_string(rec.address1);
l_cln_addr2 := clean_string(rec.address2);
l_cln_city := clean_string(rec.city);
l_cln_state := clean_string(rec.state);
l_cln_zip := clean_string(rec.postalcode);
end loop;
l_end := systimestamp;
l_interval := l_end - l_start;
dbms_output.put_line('Procedural approach timing: ' || l_interval);
-------------------------------------------------
l_cnt := 0;
l_start := systimestamp;
for rec in sel_cur1
loop
-- cleaning already done in SQL
l_cnt := l_cnt + 1;
end loop;
l_end := systimestamp;
l_interval := l_end - l_start;
dbms_output.put_line('SQL approach timing: ' || l_interval);
-------------------------------------------------
l_cnt := 0;
l_start := systimestamp;
for rec in sel_cur2
loop
l_cnt := l_cnt + 1;
l_cln_name := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(rec.name, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
l_cln_addr1 := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(rec.address1, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
l_cln_addr2 := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(rec.address2, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
l_cln_city := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(rec.city, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
l_cln_state := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(rec.state, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
l_cln_zip := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(rec.postalcode, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
end loop;
l_end := systimestamp;
l_interval := l_end - l_start;
dbms_output.put_line('Existing approach timing: ' || l_interval);
end;
输出结果为:
Procedural approach timing: +00 00:02:04.0320
SQL approach timing: +00 00:02:49.4326
Existing approach timing: +00 00:05:50.1607
此外,本机编译似乎只能帮助处理过程(而不是从SQL查询中调用函数),但似乎比regexp_replace解决方案快得多。希望有所帮助。
答案 1 :(得分:1)
首先,我要说我并没有回答我自己的问题,但我接受了tbone的回答。提供这个答案的原因是评论不要让我发布我真正想要的内容。
我通过几次调整创建了一个与tbone几乎完全相同的函数,通过改变处理小写字符范围的方式摆脱了UPPER,并将数字更改为binary_integers。
FUNCTION CLEAN_STRING(IN_STRING in VARCHAR2) RETURN VARCHAR2
AS
OUT_STRING VARCHAR2(32767) := '';
IN_LENGTH BINARY_INTEGER;
CNT BINARY_INTEGER := 0;
IN_CHAR CHAR(1);
OUT_CHAR CHAR(1);
DEC_CHAR BINARY_INTEGER;
PREV_SPACE BOOLEAN := FALSE;
BEGIN
IN_LENGTH := LENGTH(IN_STRING);
WHILE CNT < IN_LENGTH
LOOP
CNT := CNT + 1;
IN_CHAR := SUBSTR(IN_STRING, CNT, 1);
DEC_CHAR := ASCII(IN_CHAR);
-- blank out non alphanumerics
IF ((DEC_CHAR >= 48 AND DEC_CHAR <= 57) OR
(DEC_CHAR >= 65 AND DEC_CHAR <= 90))
THEN
--keep it
OUT_CHAR := IN_CHAR;
ELSE
IF (DEC_CHAR >= 97 AND DEC_CHAR <= 122)
THEN
OUT_CHAR := CHR(DEC_CHAR - 32);
ELSE
OUT_CHAR := ' ';
END IF;
END IF;
IF (NOT(PREV_SPACE AND OUT_CHAR = ' '))
THEN
OUT_STRING := OUT_STRING || OUT_CHAR;
END IF;
<<endloop>>
IF (OUT_CHAR = ' ') THEN
PREV_SPACE := TRUE;
ELSE
PREV_SPACE := FALSE;
END IF;
END LOOP;
RETURN TRIM(OUT_STRING);
END CLEAN_STRING;
然后我创建了一个像tbone一样的简单测试装备,但我测试了三种不同的例程。首先,我验证它们都返回相同的结果,然后计算每个例程的时间。这是试验台;
set serveroutput on
DECLARE
CURSOR PATHMAST_CURS
IS
SELECT PATHMAST_TEXT_DIAGNOSIS FROM PATHMAST WHERE ROWNUM < 100000;
DUMMY CLOB;
DUMMY_1 CLOB;
DUMMY_2 CLOB;
l_interval interval day to second(4);
l_start timestamp;
l_end timestamp;
diff_count_1 binary_integer := 0;
diff_count_2 binary_integer := 0;
BEGIN
FOR PATH_REC IN PATHMAST_CURS
LOOP
DUMMY := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(NVL(PATH_REC.PATHMAST_TEXT_DIAGNOSIS,' '), '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
DUMMY_1 := pathmast_utility_3.CLEAN_STRING(NVL(PATH_REC.PATHMAST_TEXT_DIAGNOSIS,' '));
DUMMY_2 := regexp_replace(trim(translate(NVL(PATH_REC.PATHMAST_TEXT_DIAGNOSIS,' '),'abcdefghijklmnopqrstuvwxyz`~!@#$%^&*()''_+-={[}]|/\":;,.<>?µ’±€'||chr(9),'ABCDEFGHIJKLMNOPQRSTUVWXYZ ')),'( )* ',' ');
IF DUMMY_1 != DUMMY
THEN
diff_count_1 := diff_count_1 + 1;
END IF;
IF DUMMY_2 != DUMMY
THEN
diff_count_2 := diff_count_2 + 1;
dbms_output.put_line('Regexp: ' || DUMMY);
dbms_output.put_line('Translate: ' || DUMMY_2);
END IF;
END LOOP;
dbms_output.put_line('CLEAN_STRING differences: ' || diff_count_1);
dbms_output.put_line('Translate differences: ' || diff_count_2);
l_start := systimestamp;
FOR PATH_REC IN PATHMAST_CURS
LOOP
DUMMY := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(PATH_REC.PATHMAST_TEXT_DIAGNOSIS, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
END LOOP;
l_end := systimestamp;
l_interval := l_end - l_start;
dbms_output.put_line('Regexp approach timing: ' || l_interval);
-------------------------------------------------
l_start := systimestamp;
FOR PATH_REC IN PATHMAST_CURS
LOOP
DUMMY := pathmast_utility_3.CLEAN_STRING(PATH_REC.PATHMAST_TEXT_DIAGNOSIS);
END LOOP;
l_end := systimestamp;
l_interval := l_end - l_start;
dbms_output.put_line('CLEAN_STRING approach timing: ' || l_interval);
-------------------------------------------------
l_start := systimestamp;
FOR PATH_REC IN PATHMAST_CURS
LOOP
DUMMY := regexp_replace(trim(translate(NVL(PATH_REC.PATHMAST_TEXT_DIAGNOSIS,' '),'abcdefghijklmnopqrstuvwxyz`~!@#$%^&*()''_+-={[}]|/\":;,.<>?µ’±€'||chr(9),'ABCDEFGHIJKLMNOPQRSTUVWXYZ ')),'( )* ',' ');
END LOOP;
l_end := systimestamp;
l_interval := l_end - l_start;
dbms_output.put_line('TRANSLATE approach timing: ' || l_interval);
-------------------------------------------------
END;
以下是结果;
anonymous block completed
CLEAN_STRING differences: 0
Translate differences: 0
Regexp approach timing: +00 00:00:52.9160
CLEAN_STRING approach timing: +00 00:00:05.5220
TRANSLATE approach timing: +00 00:00:13.4320
这一切都没有编译本机。所以tbone是最大的赢家。谢谢tbone。
如果出于任何原因您希望/需要使用翻译版本,您应该以编程方式构建翻译字符串以获取所有特殊字符。
答案 2 :(得分:0)
也许,您可以使用TRANSLATE而不是正则表达式来删除特殊字符并将小写字母转换为大写字母。
regexp_replace(
trim(
translate(x,
'abcdefghijklmnopqrstuvwxyz`~!@#$%^&*()_+-={[}]|/\"'':;,.<>?',
'ABCDEFGHIJKLMNOPQRSTUVWXYZ '
)
),
' {2,}',
' '
)
在一个包含1000行和一列的表格中尝试使用1到4000之间的随机字符。 导致时间减少约35%。(没有尝试使用PLSQL)。