如何在解析CSV时忽略双引号内的逗号

时间:2016-02-19 07:40:48

标签: sql oracle csv plsql

从csv文件中读取数据后,我收到以下格式的字符串

29218368,8062115," Benedict Canyon Equities, Inc",CLS,,FAX

我只想把它转换成一个包含6个值的数组,其中Inc之前的逗号需要转义(不能被当作分隔符)。有人能建议在PL/SQL中实现这一点的最佳方法吗?

3 个答案:

答案 0 :(得分:3)

这与另一个问题(this question)类似,但您的列表中有空元素;如果直接套用我在那里试过的其中一种模式,空元素会被跳过:

-- SQL*Plus bind variable holding one raw CSV line.
-- The sample line is 58 characters long, so the variable must be declared
-- larger than that or the assignment below fails with ORA-06502.
var v_lastline varchar2(1000);
exec :v_lastline := '29218368,8062115," Benedict Canyon Equities, Inc",CLS,,FAX';

-- Naive tokenizer: each token is either a double-quoted run (which may contain
-- commas) or a run of one or more non-comma characters.  Because [^,]+ needs
-- at least one character, empty elements produce no token and are skipped.
select level as lvl,
  regexp_substr(:v_lastline, '("[^"]*"|[^,]+)', 1, level) as element
from dual
connect by level <= regexp_count(:v_lastline, '("[^"]*"|[^,]+)');

       LVL ELEMENT                                
---------- ----------------------------------------
         1 29218368                                
         2 8062115                                 
         3 " Benedict Canyon Equities, Inc"        
         4 CLS                                     
         5 FAX                                     

如果你能找到一个永远不会出现在数据中的特殊字符,就可以绕过这个问题:先把每个逗号替换为"逗号+该字符",使空元素中也包含这个字符,拆分之后再把它删除:

-- Split :v_lastline on commas, keeping empty elements and commas inside
-- double quotes.  Every comma becomes ',§' so that each element after the
-- first starts with the marker (an empty element is then the token '§').
-- A '§' is also prepended so the first element is treated the same way;
-- without it a leading empty element is dropped and a quoted first element
-- that contains a comma is split.  '§' must never occur in the data.
select level as lvl,
  replace(regexp_substr('§' || replace(:v_lastline, ',', ',§'),
    '(§"[^"]*"|[^,]+)', 1, level), '§', null) as element
from dual
connect by regexp_substr('§' || replace(:v_lastline, ',', ',§'),
  '(§"[^"]*"|[^,]+)', 1, level) is not null;

       LVL ELEMENT                                
---------- ----------------------------------------
         1 29218368                                
         2 8062115                                 
         3 " Benedict Canyon Equities, Inc"        
         4 CLS                                     
         5                                         
         6 FAX                                     

它是拆分带分隔符字符串的常用方法的扩展;该常用方法另有详细解释(原文链接:which is explained in detail here)。

  • replace(:v_lastline, ',', ',§')将...,CLS,,FAX更改为...,§CLS,§,§FAX,其中§是一个永远不会出现在数据中的字符。
  • regexp_substr(..., '(§"[^"]*"|[^,]+)', 1, level)使用正则表达式对更新后的值进行分词,该正则表达式匹配任何用双引号括起的值(现在其前面也带有特殊字符)或一串非逗号字符;由于按顺序求值,引号内的逗号会被忽略。
  • level是分层查询(connect by)语法的一部分,这里用作要提取的标记的序号。
  • connect by regexp_substr(<same value and pattern>) is not null只是用来确定一共有多少个标记。
  • 最后replace(regexp_substr(...), '§', null)删除第一步中加入的特殊字符。

然后,您可以在外面再套一层replace()来删除双引号,并根据需要用trim()去掉空格。

你没有说明"数组"具体指什么,但你可以在PL/SQL中运行该查询,并把结果批量收集(bulk collect)到一个集合中(如果这就是你打算使用的方式)。例如,使用内置的ODCIVARCHAR2LIST集合类型:

set serveroutput on
-- Splits one CSV line into a sys.odcivarchar2list collection and prints every
-- element.  Quotes are stripped and surrounding blanks trimmed per element.
declare
  -- Sized well above the 58-character sample line; varchar2(50) would raise
  -- ORA-06502 on the assignment below.
  v_lastline varchar2(1000);
  v_array sys.odcivarchar2list;
begin
  v_lastline := '29218368,8062115," Benedict Canyon Equities, Inc",CLS,,FAX';

  -- Reads the local variable, so the block does not depend on a SQL*Plus bind
  -- variable having been set beforehand.
  -- Every comma becomes ',§' and a '§' is prepended, so each element,
  -- including the first and any empty one, yields a token starting with '§'.
  -- The marker and the double quotes are removed again after the split.
  -- '§' must never occur in the data.
  select trim(replace(replace(
    regexp_substr('§' || replace(v_lastline, ',', ',§'),
      '(§"[^"]*"|[^,]+)', 1, level), '§', null), '"', null))
  bulk collect into v_array
  from dual
  connect by regexp_substr('§' || replace(v_lastline, ',', ',§'),
    '(§"[^"]*"|[^,]+)', 1, level) is not null;

  dbms_output.put_line('Number of elements: ' || v_array.count);
  for i in 1..v_array.count loop
    dbms_output.put_line('Index ' || i || ' has: ' || v_array(i));
  end loop;
end;
/

Number of elements: 6
Index 1 has: 29218368
Index 2 has: 8062115
Index 3 has: Benedict Canyon Equities, Inc
Index 4 has: CLS
Index 5 has: 
Index 6 has: FAX

对于多个连续的空元素,这(现在)也有效:

-- (Re)declare the bind variable with enough room for the long sample line;
-- a varchar2(50) declaration would make the assignment fail with ORA-06502.
var v_lastline varchar2(1000);
exec :v_lastline := '29218368,8062115," Benedict Canyon Equities, Inc",,,,,,,CLS,,,,,FAX,,,,,,,,,,,,,,,,,,INVOICE';
-- Same marker-based split as for the short line: every comma becomes ',§' and
-- a '§' is prepended, so each element, including runs of empty ones and the
-- first one, yields exactly one token.  '§' must never occur in the data.
select level as lvl,
  replace(regexp_substr('§' || replace(:v_lastline, ',', ',§'),
    '(§"[^"]*"|[^,]+)', 1, level), '§', null) as element
from dual
connect by regexp_substr('§' || replace(:v_lastline, ',', ',§'),
  '(§"[^"]*"|[^,]+)', 1, level) is not null;

       LVL ELEMENT                                
---------- ----------------------------------------
         1 29218368                                
         2 8062115                                 
         3 " Benedict Canyon Equities, Inc"        
         4                                         
...
         9                                         
        10 CLS                                     
        11                                         
...
        14                                         
        15 FAX                                     
        16                                         
...
        32                                         
        33 INVOICE                                 

答案 1 :(得分:0)

如果CSV的结构是固定的,您可以尝试使用类似下面的方法:

-- Positional split for one fixed layout: six fields, the third of which is
-- double-quoted.  One row is generated per field (level 1..6) and each row
-- extracts its field with an expression chosen for that position.
-- NOTE(review): '.*??' looks like a non-greedy '.*?' with an extra '?';
-- confirm the intended quantifier before reusing these patterns.
with text(text) as ( select '29218368,8062115," Benedict Canyon Equities, Inc",CLS,,FAX' from dual)
select level,
       -- strip the delimiter commas that the patterns below include in the match
       trim(',' from 
                       case
                        -- fields 1-2: level-th "text followed by a comma" match, searched from position 1
                        when level in (1,2) then
                            regexp_substr(text, '(.*??)\,', 1, level)
                        -- field 3: first double-quoted substring; the quotes are part of the match
                        when level = 3 then
                            regexp_substr(text, '"(.*??)"', 1, 1)
                        -- fields 4-5: same pattern, but searched from the second double quote,
                        -- taking occurrence level - 2
                        when level in (4,5) then
                            regexp_substr(text, '(.*??)\,', instr(text, '"', 1, 2), level -2) 
                        -- field 6: third "comma followed by non-comma characters" match,
                        -- again searched from the second double quote
                        when level = 6 then       
                            regexp_substr(text, '\,([^\,]*)', instr(text, '"', 1, 2), 3)
                       end
            )
from text
connect by level <= 6

这种做法对每个部分分别处理,因此对CSV的结构做了很强的假设;但在我看来,很难找到一个真正通用的解决方案。

答案 2 :(得分:0)

这是一个没有正则表达式的解决方案,首先创建两个辅助函数

/* CAR: returns the part of PI_STR in front of the first PI_SEPARATOR,
   or PI_STR unchanged when the separator does not occur in it.
   select car('hello,world,bla') from dual --> hello */
create or replace function car(PI_STR in varchar2,
                               PI_SEPARATOR in varchar2 default ',')
  return varchar2
is
  l_pos number;
begin
  l_pos := instr(PI_STR, PI_SEPARATOR);

  if l_pos > 0 then
    return substr(PI_STR, 1, l_pos - 1);
  end if;

  -- Separator not found (instr returned 0 or NULL): hand back the input.
  return PI_STR;
end car;
/

/* CDR: returns the part of PI_STR behind the first PI_SEPARATOR,
   or NULL when the separator does not occur in it.
   select cdr('hello,world,bla') from dual --> world,bla */
create or replace function cdr(PI_STR in varchar2,
                               PI_SEPARATOR in varchar2 default ',')
  return varchar2
is
  l_pos number;
begin
  l_pos := instr(PI_STR, PI_SEPARATOR);

  if l_pos > 0 then
    -- Skip the whole separator, which may be longer than one character.
    return substr(PI_STR, l_pos + length(PI_SEPARATOR));
  end if;

  -- Separator not found: nothing remains (the empty string is NULL in Oracle).
  return '';
end cdr;
/

现在:按','拆分;如果某个结果中出现转义字符,则把后续内容一直拼接到下一个转义字符为止:

-- Collection type used as the pipelined return type of get_columns.
-- The trailing slash is required for SQL*Plus to execute CREATE TYPE.
create or replace type csv_col is table of varchar2(4000);
/

/* GET_COLUMNS: splits one CSV line into its fields and pipes them out in order.
   PI_STR        the line to split (NULL yields no rows)
   PI_SEPARATOR  field separator, e.g. ','
   PI_ESC_CHAR   quoting character, e.g. '"'; separators between a pair of
                 quoting characters belong to the field, and the quoting
                 characters themselves are not part of the returned value
   Empty fields, including leading and trailing ones, are returned as NULL
   rows.  A doubled quoting character is not treated as a literal quote; it
   simply toggles the quoted state twice. */
create or replace function get_columns(PI_STR in varchar2,
                                       PI_SEPARATOR in varchar2,
                                       PI_ESC_CHAR in varchar2)
  return csv_col pipelined
is
  l_str_len pls_integer := nvl(length(PI_STR), 0);
  l_sep_len pls_integer := nvl(length(PI_SEPARATOR), 0);
  l_esc_len pls_integer := nvl(length(PI_ESC_CHAR), 0);
  l_pos     pls_integer := 1;
  l_field   varchar2(4000);
  l_quoted  boolean := false;
begin
  -- Single left-to-right scan; l_field accumulates the current field.
  while l_pos <= l_str_len loop
    if l_esc_len > 0 and substr(PI_STR, l_pos, l_esc_len) = PI_ESC_CHAR then
      -- Quoting character: flip the state and drop the character itself.
      l_quoted := not l_quoted;
      l_pos := l_pos + l_esc_len;
    elsif not l_quoted
          and l_sep_len > 0
          and substr(PI_STR, l_pos, l_sep_len) = PI_SEPARATOR then
      -- Unquoted separator: the current field is complete.
      pipe row(l_field);
      l_field := null;
      l_pos := l_pos + l_sep_len;
    else
      l_field := l_field || substr(PI_STR, l_pos, 1);
      l_pos := l_pos + 1;
    end if;
  end loop;

  -- Emit the last field (possibly empty) unless there was no input at all.
  if l_str_len > 0 then
    pipe row(l_field);
  end if;

  return;
end get_columns;
/

这样称呼:

-- Split the sample line on ',' using '"' as the quoting character;
-- a scalar collection exposes its elements as COLUMN_VALUE.
select cols.column_value
  from table(get_columns('29218368,8062115," Benedict Canyon Equities, Inc",CLS,,FAX',
                         ',',
                         '"')) cols;

-> 结果

29218368
8062115
 Benedict Canyon Equities, Inc
CLS

FAX