数据样本

Question

这样做的动机是能够轻松、准确地为 nested ranges challenge 生成数据样本。

表只包含一个文本类型的列。文本包含一行或多行，每行包含一个或多个部分（section），每个部分由同一个字母连续重复构成。目标是编写一个查询，为每个部分返回一行，其中包含该部分的起始位置、结束位置和值。

数据样本

-- Sample table: one text column that holds the whole ASCII-art drawing.
CREATE TABLE t (
    txt VARCHAR(1000)
);

-- A single multi-line value. The literal opens and closes with a line break,
-- and every run of one repeated letter is a section to be extracted.
INSERT INTO t (txt)
VALUES ('
AAAAAAAAAAAAAAAAAAAAAAAAAAAA  BBBB    CCCCCCCCCCCCCCCCCCCCCCCCC
DDDE  FFFFFFFF    GGGGGGGGG               HHHHHHHH    IIIIIII
JJ      KKKLLL       MM NN                              OOOOO
            P                                              QQ
');

请求结果

*只需要最后 3 列（section_start / section_end / section_val），其余列仅用于调试。

line_ind    section_ind   section_length  section_start  section_end   section_val
1           1             28              1              28            A
1           2             4               31             34            B
1           3             25              39             63            C
2           1             3               1              3             D
2           2             1               4              4             E
2           3             8               7              14            F
2           4             9               19             27            G
2           5             8               43             50            H
2           6             7               55             61            I
3           1             2               1              2             J
3           2             3               9              11            K
3           3             3               12             14            L
3           4             2               22             23            M
3           5             2               25             26            N
3           6             5               57             61            O
4           1             1               13             13            P
4           2             2               60             61            Q

Answer 1

Teradata

目前 regexp_split_to_table 似乎不支持零长度表达式（我已为此创建了事件 RECGZJKZV）。为了绕过这个限制，我使用 regexp_replace 在相邻的不同字母序列之间插入空格（例如 KKKLLL 中的 KKK 与 LLL 之间）。

-- Splits every text in t into lines and every line into sections (runs of one
-- repeated non-whitespace character), then reports each section's index,
-- length, start/end position within its line, and its character.
with        l
            as
            (
                -- One row per line of t.txt: the text is split on '\r'.
                -- The constant -1 is passed as the first (key) argument and is
                -- returned as minus_one, which is not selected here.
                select      line_ind
                           ,line

                from        table
                            (
                                regexp_split_to_table (-1,t.txt,'\r','')
                                returns (minus_one int,line_ind int,line varchar(1000))
                            )   
                            as l
            )

select      l.line_ind
           ,r.section_ind                                           
           ,char_length     (r.section)                                 as section_length
           -- Positions are looked up in the ORIGINAL line (not the space-padded
           -- copy split below): the section_ind-th match of '(\S)\1*'.
           -- NOTE(review): the last argument is taken to select the match start (0)
           -- or the position right after the match (1), hence the "- 1" for the
           -- end -- confirm against the Teradata regexp_instr documentation.
           ,regexp_instr    (l.line,'(\S)\1*',1,r.section_ind,0)        as section_start
           ,regexp_instr    (l.line,'(\S)\1*',1,r.section_ind,1) - 1    as section_end
           -- Every character of a section is the same, so the first one is its value.
           ,substr          (r.section,1,1)                             as section_val

-- regexp_replace inserts a space at every position where the preceding character
-- (named group c) is not followed by the same character, so adjacent sections
-- such as KKKLLL become separable; the padded line is then split on '\s+'.
-- The line's line_ind is passed as the key so each section can be matched
-- back to its line in the where clause.
from        table
            (
                regexp_split_to_table (l.line_ind,regexp_replace (l.line,'(?<=(?P<c>.))(?!(?P=c))',' '),'\s+','')
                returns (line_ind int,section_ind int,section varchar(1000))
            )  
            as r
           ,l

where       l.line_ind  =
            r.line_ind

order by    l.line_ind
           ,r.section_ind   
;

Oracle

-- For every section (maximal run of one repeated non-whitespace character) in
-- t.txt, returns its start and end position within its own line and its character.
--
-- regexp_instr reports positions relative to the whole text, so the position of
-- the closest preceding line feed (0 when there is none) is subtracted to make
-- them line-relative. instr with a negative position searches backwards;
-- <match start> - length (txt) - 1 is the negative form of position <match start>.
--
-- Row generation: level runs from 1 to the number of sections of each text.
--   * start with              : a text without any section produces no rows
--                               (otherwise its root row would still be returned).
--   * prior rowid = rowid     : a row only connects to itself, so several rows in t
--                               do not multiply each other's results.
--   * prior sys_guid() is not null : makes every generated row distinct, which keeps
--                               the self-connection from being reported as a loop.
select      regexp_instr  (txt,'(\S)\1*',1,level,0)       - instr (txt,chr(10),regexp_instr (txt,'(\S)\1*',1,level,0) - length (txt) - 1,1)   as section_start
           ,regexp_instr  (txt,'(\S)\1*',1,level,1) - 1   - instr (txt,chr(10),regexp_instr (txt,'(\S)\1*',1,level,0) - length (txt) - 1,1)   as section_end
           ,regexp_substr (txt,'(\S)\1*',1,level,'',1)                                                                                        as section_val

from        t

start with  regexp_count (txt,'(\S)\1*') > 0

connect by  level <= regexp_count (txt,'(\S)\1*')
       and  prior rowid = rowid
       and  prior sys_guid() is not null
;

Answer 2

Oracle

即使有多个输入行，此查询也同样有效：

-- Step 1 (text_lines): walk each text line by line. '.' does not match a line
-- break here, so '.*' yields one line at a time; next_pos is the position right
-- after that line and becomes 0 once the text is exhausted.
WITH text_lines (txt, id, line, next_pos, line_no) AS (
    SELECT
        txt,
        id,
        REGEXP_SUBSTR(txt, '.*', 1, 1),
        REGEXP_INSTR(txt, '.*', 1, 1, 1),
        1
    FROM t

    UNION ALL

    SELECT
        txt,
        id,
        REGEXP_SUBSTR(txt, '.*', next_pos + 1, 1),
        REGEXP_INSTR(txt, '.*', next_pos + 1, 1, 1),
        line_no + 1
    FROM text_lines
    WHERE next_pos > 0
),
-- Step 2 (sections): walk each non-empty line section by section. A section is a
-- run of one repeated non-whitespace character; each step resumes the search
-- right after the previous section's end. When nothing more matches,
-- REGEXP_INSTR returns 0, so section_end drops to -1 and the recursion stops.
sections (id, line, line_no, section_start, section_end, section_value) AS (
    SELECT
        id,
        line,
        line_no,
        REGEXP_INSTR(line, '(\S)\1*', 1, 1, 0),
        REGEXP_INSTR(line, '(\S)\1*', 1, 1, 1) - 1,
        REGEXP_SUBSTR(line, '(\S)\1*', 1, 1, NULL, 1)
    FROM text_lines
    WHERE line IS NOT NULL
      AND next_pos > 0

    UNION ALL

    SELECT
        id,
        line,
        line_no,
        REGEXP_INSTR(line, '(\S)\1*', section_end + 1, 1, 0),
        REGEXP_INSTR(line, '(\S)\1*', section_end + 1, 1, 1) - 1,
        REGEXP_SUBSTR(line, '(\S)\1*', section_end + 1, 1, NULL, 1)
    FROM sections
    WHERE section_end > 0
)
-- Keep real sections only: the terminating "no match" rows have section_end <= 0.
SELECT
    id,
    line_no,
    section_start,
    section_end,
    section_value
FROM sections
WHERE section_end > 0
ORDER BY
    id,
    line_no,
    section_start

因此，对于输入数据（添加了id列以便能够轻松区分文本片段）

-- Sample table with an id column so the individual texts can be told apart.
CREATE TABLE t (
    id  NUMBER(5,0),
    txt VARCHAR(1000)
);

-- Text 1: the multi-line drawing (the literal opens and closes with a line break).
INSERT INTO t (id, txt)
VALUES (1, '
AAAAAAAAAAAAAAAAAAAAAAAAAAAA  BBBB    CCCCCCCCCCCCCCCCCCCCCCCCC
DDDE  FFFFFFFF    GGGGGGGGG               HHHHHHHH    IIIIIII
JJ      KKKLLL       MM NN                              OOOOO
            P                                              QQ
');

-- Text 2: a single line without any line break.
INSERT INTO t (id, txt)
VALUES (2, 'RRRSTT UUU    V WXYZ');

这将输出：

ID | LINE_NO | SECTION_START | SECTION_END | SECTION_VALUE
-: | ------: | ------------: | ----------: | :------------
 1 |       2 |             1 |          28 | A            
 1 |       2 |            31 |          34 | B            
 1 |       2 |            39 |          63 | C            
 1 |       3 |             1 |           3 | D            
 1 |       3 |             4 |           4 | E            
 1 |       3 |             7 |          14 | F            
 1 |       3 |            19 |          27 | G            
 1 |       3 |            43 |          50 | H            
 1 |       3 |            55 |          61 | I            
 1 |       4 |             1 |           2 | J            
 1 |       4 |             9 |          11 | K            
 1 |       4 |            12 |          14 | L            
 1 |       4 |            22 |          23 | M            
 1 |       4 |            25 |          26 | N            
 1 |       4 |            57 |          61 | O            
 1 |       5 |            13 |          13 | P            
 1 |       5 |            60 |          61 | Q            
 2 |       1 |             1 |           3 | R            
 2 |       1 |             4 |           4 | S            
 2 |       1 |             5 |           6 | T            
 2 |       1 |             8 |          10 | U            
 2 |       1 |            15 |          15 | V            
 2 |       1 |            17 |          17 | W            
 2 |       1 |            18 |          18 | X            
 2 |       1 |            19 |          19 | Y            
 2 |       1 |            20 |          20 | Z

db<>fiddle 见此处

SQL / REGEX 谜题/挑战：如何将由多种字符构成的 ASCII 艺术范围转换为关系数据？

数据样本

请求结果

2 个答案:

Teradata

Oracle

Oracle