Question

我有一个如下所示的字符串：

12361_BBMS_GTECHL|12362_BBMS_PRIM|12363_BBMS_SEC|....and so on

所以我需要获取

12361 and BBMS_GTECHL
12362 and BBMS_PRIM
12363 and BBMS_SEC

我尝试过：

-- Asker's first attempt: rewrite the delimited string as an XML document and
-- extract one /rowset/row node per '|'-separated pair.
-- The inner REPLACE turns every '|' into a row boundary; the outer REPLACE
-- turns every '=' into the boundary between <Code> and <Value>.
-- NOTE(review): the literal below separates code and value with '=', whereas
-- the string described in the question uses '_' -- confirm which is intended.
select *
  FROM
  TABLE(XMLSEQUENCE(
        EXTRACT(
            XMLTYPE('<rowset><row><Code>'||
                replace(replace('12361=BBMS_GTECHL|12362=BBMS_PRIM','|','</Value></row><row><Code>'),'=','</Code><Value>')||'</Value>'||'</row></rowset>'),'/rowset/row')));


-- Prints the part of the input that follows the first '|'
-- (for this literal: 12362_BBMS_PRIM).
declare
  l_str varchar2(1000) := '12361_BBMS_GTECHL|12362_BBMS_PRIM';
  l_val varchar2(1000);
begin
  -- INSTR needs both the string to search and the substring to look for;
  -- the one-argument call instr('|') does not compile.
  -- A plain assignment replaces the SELECT ... INTO ... FROM dual round trip.
  l_val := substr(l_str, instr(l_str, '|') + 1);
  dbms_output.put_line(l_val);
end;

但我无法得到想要的结果！我需要把这段逻辑写进一个包（package）里；如果能在这里得到一些提示，我就可以自己完成。

我的数据库版本是：

 Oracle Database 12c Enterprise Edition Release 12.1.0.2.0 - 64bit Production

Answer 1

下面是使用递归子查询因子化（即“递归 CTE”）的解决方案。请注意，这里用位置指针记录管道符号的位置，以及每个管道符号之后第一个下划线的位置（其余下划线被忽略）。此外，该方案只使用标准的 INSTR 和 SUBSTR，避免了正则表达式——正则表达式执行起来稍慢，在处理大量数据时这一点很重要。

-- Splits a '|'-delimited list of <code>_<description> elements into one row
-- per element, cutting each element at its FIRST underscore only.
with
    src (raw_list) as (
        select '12361_BBMS_GTECHL|12362_BBMS_PRIM|12363_BBMS_SEC'
        from   dual
    ),
    -- Wrap the list in pipes so every element has a '|' on both sides.
    padded (txt) as (
        select '|' || raw_list || '|'
        from   src
    ),
    -- Each row carries the positions needed to cut the NEXT element:
    --   bar_from = pipe in front of the element
    --   us_to    = first underscore after that pipe
    --   bar_to   = pipe that closes the element
    -- us_from is carried along but not used for cutting.
    walker (step, code, descr, txt, us_from, bar_from, us_to, bar_to) as (
        -- Anchor row (step 0) only seeds the positions; it yields no element.
        select 0,
               null,
               null,
               txt,
               1,
               1,
               instr(txt, '_'),
               instr(txt, '|', 1, 2)
        from   padded
        union all
        select step + 1,
               substr(txt, bar_from + 1, us_to - bar_from - 1),
               substr(txt, us_to + 1, bar_to - us_to - 1),
               txt,
               us_to,
               bar_to,
               instr(txt, '_', bar_to + 1),
               instr(txt, '|', bar_to + 1)
        from   walker
        -- No further underscore means no further element.
        where  us_to <> 0
    )
select code,
       descr
from   walker
where  step <> 0;

输出：

CODE    DESCR
------- --------------------
12361   BBMS_GTECHL
12362   BBMS_PRIM
12363   BBMS_SEC

Answer 2

如果我是你，并且主要考虑的是性能，我会使用表函数。mathguy 的解决方案完全可行，但改用流水线（pipelined）函数会更高效。

首先，创建函数所需的类型。

-- Remove earlier versions of the types; the collection type goes first
-- because it is defined in terms of the row type.
drop type type_test_table;

drop type type_test_row;

-- One parsed element: the code and the description that follows it.
create type type_test_row as object (
    code  varchar2(2000),
    descr varchar2(50)
)
/

-- Nested-table collection of parsed elements.
create type type_test_table is table of type_test_row
/

然后我们创建我们的函数：

-- Pipelined table function that generates test strings and splits each one
-- into (code, descr) rows.
--
-- Each generated string is a '|'-delimited list of <code>_<description>
-- elements; every element is cut at its first underscore and piped out as a
-- type_test_row.
--
-- Returns: type_test_table (pipelined), one row per element.
create or replace function test_pipe_func return type_test_table pipelined as
    -- Test-data generator: one string per LEVEL, four elements per string.
    -- NOTE(review): the recursive-CTE and bulk-collect tests in this write-up
    -- generate three elements per string, so row counts differ between tests.
    cursor c_data_in is
        select '12361'||level||'_BBMS_GTECHL'||level||'|12362'||level||'_BBMS_PRIM'||level||'|12363'||level||'_BBMS_SEC'||level||'|12364'||level||'_BBU_SEC'||level as str
        from dual
        connect by level <= 1000000;
    v_element varchar2(300);
    v_code    varchar2(100);
    v_descr   varchar2(200);
    p_deb     number;  -- start position of the current element
    p_fin     number;  -- position of the '|' that ends it; 0 for the last element
    p_sep     number;  -- position of the first '_' inside the element
begin
    for l_data_in in c_data_in loop
        p_deb := 1;
        loop
            -- One INSTR per element: search forward from the element start.
            p_fin := instr(l_data_in.str, '|', p_deb);
            v_element := case
                             when p_fin = 0 then substr(l_data_in.str, p_deb)
                             else substr(l_data_in.str, p_deb, p_fin - p_deb)
                         end;
            -- Locate the separator once and reuse it for both halves.
            p_sep := instr(v_element, '_');
            v_code := substr(v_element, 1, p_sep - 1);
            v_descr := substr(v_element, p_sep + 1);
            pipe row(type_test_row(v_code, v_descr));
            -- COALESCE also ends the loop if INSTR returned NULL (NULL input).
            exit when coalesce(p_fin, 0) = 0;
            p_deb := p_fin + 1;
        end loop;
    end loop;
    -- Every execution path of a pipelined function should end in a RETURN
    -- statement (without a value).
    return;
end test_pipe_func;
/

我稍微修改了测试用例，以便为测试生成任意多的行。我使用流水线函数，是为了在处理大数据集时限制进程的内存占用，并且可以直接在 select 中使用。如果您的用例不同（我不清楚具体情况，比如可能是要把输入数据插入到表中），另一个选择是使用批量收集（bulk collect）和 forall。

-- Bulk-collect variant of the splitter: fetches the generated test strings in
-- batches of limit_in, splits each string into (code, descr) pairs, and
-- inserts every batch into test_bbu with a single FORALL.
--
-- The procedure issues no COMMIT; transaction control is left to the caller.
create or replace procedure test_bulk_collect_proc as
    -- Test-data generator: one string per LEVEL, three elements per string.
    cursor c_data_in is
        select '12361'||level||'_BBMS_GTECHL'||level||'|12362'||level||'_BBMS_PRIM'||level||'|12363'||level||'_BBMS_SEC'||level as str
        from dual
        connect by level <= 1000000;
    type type_table_data_in is table of c_data_in%rowtype;
    table_data_in type_table_data_in;

    v_element varchar2(300);
    v_code    varchar2(100);
    v_descr   varchar2(200);
    p_deb     number;  -- start position of the current element
    p_fin     number;  -- position of the '|' that ends it; 0 for the last element
    p_sep     number;  -- position of the first '_' inside the element
    v_str     varchar2(4000);
    v_t_insr  type_test_table;

    limit_in  number := 100000;
begin
    open c_data_in;
    loop
        fetch c_data_in bulk collect into table_data_in limit limit_in;
        v_t_insr := type_test_table();
        for indx in 1 .. table_data_in.count loop
            v_str := table_data_in(indx).str;
            p_deb := 1;
            loop
                -- One INSTR per element: search forward from the element start.
                p_fin := instr(v_str, '|', p_deb);
                v_element := case
                                 when p_fin = 0 then substr(v_str, p_deb)
                                 else substr(v_str, p_deb, p_fin - p_deb)
                             end;
                -- Locate the separator once and reuse it for both halves.
                p_sep := instr(v_element, '_');
                v_code := substr(v_element, 1, p_sep - 1);
                v_descr := substr(v_element, p_sep + 1);
                v_t_insr.extend;
                v_t_insr(v_t_insr.last) := type_test_row(v_code, v_descr);
                -- COALESCE also ends the loop if INSTR returned NULL (NULL input).
                exit when coalesce(p_fin, 0) = 0;
                p_deb := p_fin + 1;
            end loop;
        end loop;

        -- 1 .. COUNT is an empty range for an empty batch, whereas
        -- FIRST .. LAST would be NULL .. NULL. An empty batch occurs whenever
        -- the row count is an exact multiple of limit_in, as it is here.
        forall t in 1 .. v_t_insr.count
            insert into test_bbu (code, descr)
            values (v_t_insr(t).code, v_t_insr(t).descr);

        exit when table_data_in.count < limit_in;
    end loop;
    close c_data_in;
exception
    when others then
        -- Release the cursor before propagating the error.
        if c_data_in%isopen then
            close c_data_in;
        end if;
        raise;
end test_bulk_collect_proc;
/

我在自己的数据库上测试了这三种方法。mathguy 的 SQL 和我的流水线函数是用 CTAS 测试的，而批量收集的版本则是直接执行该过程。

-- Benchmark of the recursive-CTE splitter: generate one test string per LEVEL
-- and store every (code, descr) pair it produces in test_bbu.
create table test_bbu as
with
    src (raw_list) as (
        select '12361'||level||'_BBMS_GTECHL'||level||'|12362'||level||'_BBMS_PRIM'||level||'|12363'||level||'_BBMS_SEC'||level
        from   dual
        connect by level <= 1000000
    ),
    -- Wrap each list in pipes so every element has a '|' on both sides.
    padded (txt) as (
        select '|' || raw_list || '|'
        from   src
    ),
    -- Each row carries the positions needed to cut the NEXT element:
    --   bar_from = pipe in front of the element
    --   us_to    = first underscore after that pipe
    --   bar_to   = pipe that closes the element
    walker (step, code, descr, txt, us_from, bar_from, us_to, bar_to) as (
        -- Anchor row (step 0) only seeds the positions; it yields no element.
        select 0,
               null,
               null,
               txt,
               1,
               1,
               instr(txt, '_'),
               instr(txt, '|', 1, 2)
        from   padded
        union all
        select step + 1,
               substr(txt, bar_from + 1, us_to - bar_from - 1),
               substr(txt, us_to + 1, bar_to - us_to - 1),
               txt,
               us_to,
               bar_to,
               instr(txt, '_', bar_to + 1),
               instr(txt, '|', bar_to + 1)
        from   walker
        -- No further underscore means no further element.
        where  us_to <> 0
    )
select code,
       descr
from   walker
where  step <> 0;

-- Benchmark of the pipelined function: store everything it pipes out.
create table test_bbu2 as
select code,
       descr
from   table(test_pipe_func);

-- Benchmark of the bulk-collect variant: SQL*Plus EXECUTE runs the procedure.
execute test_bulk_collect_proc;

我分别用 50 万行和 100 万行测试了这三种方法。以下是我的结果，但建议您在做决定之前先在自己的环境中测试一下。

                 500K          1M
----------------------------------------
SQL               36s          1m:15s
Pipelined         11s          23s
Bulk Collect      8s           17s

从管道分隔的字符串中提取值

2 个答案: