这是SQLFiddle demo

Question

我有下面的示例表和数据。我需要：

1. 将每行中的每个句子（以逗号分隔的部分）拆分成新行；

2. 根据 soundex 函数，按句子的最后部分对每行中的单词进行计数。

-- Sample source table: each row holds an id and a comma-separated list of words.
create table a (
    id    number(9),
    words varchar2(500)  -- VARCHAR2 is the string type Oracle recommends over VARCHAR
);

-- Explicit column lists keep these inserts valid if columns are added or reordered.
insert into a (id, words) values (1, 'UK,LONDON,YEMEN,JOHN,CAIRO,OMAR ALI,EGYPT,Cairo,YEMAN,OMAR AMR ALI,LONDAN');
insert into a (id, words) values (2, 'UK,SUDAI,SUDAIN,AYHAM SHAHER YAFOOZ,ALI YAFOOZ');
insert into a (id, words) values (3, 'MALAYSIA, AHMED ALI,MALYSIAN');

expected output

-- Target table for the expected output: an id, a word, and that word's count.
-- Uses ASCII parentheses and commas; full-width punctuation is not valid SQL.
create table temp_words (
    id          number(9),
    words       varchar2(100),
    count_words number(9)
);

id            words                count_words
1              UK                     1
1            LONDON                   2
1            YEMEN                    2
1            CAIRO                    2
1            OMAR ALI                 2
1             JOHN                    1
2              UK                     1
2              SUDAI                  2
2          AYHAM SHAHER YAFOOZ        2
3               MALAYSIA              2
3              AHMED ALI              1

问候

Answer 1

根据需要拆分数据，可以使用“connect by”作为行生成器。

-- Splits each comma-separated list in a.words into one row per element.
-- src wraps the list in a leading and trailing comma so every element sits
-- between two commas. no_of_words is the number of commas in the original
-- list plus one: TRANSLATE removes every ',' (the '.' maps to itself), so the
-- length difference is the comma count.
-- The CONNECT BY subquery d generates r = 1 .. max(no_of_words); joining on
-- d.r <= a.no_of_words gives one row per element, and SUBSTR returns the text
-- between the r-th and (r+1)-th comma of the wrapped list.
-- Elements are returned as stored (not trimmed), and no_of_words is the list
-- size per id, not a per-word Soundex count.
SQL> with src as (select id,',' || words || ',' as words,
  2                      length(words) - length(translate(words, '.,', '.')) + 1 no_of_words
  3                 from a)
  4  select a.id,
  5         substr(a.words,
  6                instr(words, ',', 1, r) + 1,
  7                instr(words, ',', 1, r + 1) - instr(words, ',', 1, r) - 1) word,
  8         a.no_of_words
  9    from (select level r
 10            from dual
 11          connect by level <= (select max(no_of_words) from src)) d
 12         inner join src a
 13                on d.r <= a.no_of_words
 14   where a.no_of_words is not null
 15   order by a.id, d.r
 16  /

        ID WORD                 NO_OF_WORDS
---------- -------------------- -----------
         1 UK                            11
         1 LONDON                        11
         1 YEMEN                         11
         1 JOHN                          11
         1 CAIRO                         11
         1 OMAR ALI                      11
         1 EGYPT                         11
         1 Cairo                         11
         1 YEMAN                         11
         1 OMAR AMR ALI                  11
         1 LONDAN                        11
         2 UK                             5
         2 SUDAI                          5
         2 SUDAIN                         5
         2 AYHAM SHAHER YAFOOZ            5
         2 ALI YAFOOZ                     5
         3 MALAYSIA                       3
         3  AHMED ALI                     3
         3 MALYSIAN                       3

19 rows selected.

SQL>

Answer 2

这是SQLFiddle demo

-- Splits each comma-separated list in a.words into one row per element.
-- Output columns: id, words (the original list), word (the extracted element)
-- and countwords (the element's 1-based position within the list).
with table_i as (
    -- Zero-based element index i = 0 .. (largest comma count found in any row).
    -- Sizing the generator from the data means lists of any length are split
    -- completely instead of being cut off at a fixed number of elements.
    select level - 1 as i
    from dual
    connect by level <= (
        select coalesce(max(regexp_count(words, ',')), 0) + 1
        from a
    )
)
select
    a.id,
    a.words,
    case
        when t.i = 0 then
            -- First element: everything before the first comma.
            substr(
                a.words,
                1,
                case
                    -- No comma at all: 100000 is an oversized length meaning
                    -- "take the rest of the string".
                    when instr(a.words, ',', 1, 1) = 0 then 100000
                    else instr(a.words, ',', 1, 1) - 1
                end
            )
        else
            -- Later elements: text between the i-th and (i+1)-th comma.
            substr(
                a.words,
                instr(a.words, ',', 1, t.i) + 1,
                case
                    -- Last element has no following comma: take the rest.
                    when instr(a.words, ',', 1, t.i + 1) = 0 then 100000
                    else instr(a.words, ',', 1, t.i + 1) - instr(a.words, ',', 1, t.i) - 1
                end
            )
    end as word,
    t.i + 1 as countwords
from a
cross join table_i t
where
    -- Keep index 0 always; keep index i > 0 only when the list has an i-th comma.
    -- CASE is used (rather than OR) so INSTR is never called with occurrence 0.
    case
        when t.i > 0 then instr(a.words, ',', 1, t.i)
        else 100000
    end <> 0
order by a.id, t.i

Answer 3

另一种方法（使用regexp_count和regexp_substr正则表达式函数）：

-- Splits each comma-separated list in a.words into one row per element and
-- counts, per id, how many elements share the same Soundex code.
-- Occurence generates oc = 1 .. the largest number of non-empty, comma-free
-- tokens ('[^,]+') found in any row of a.
-- The inline view s cross-joins those indexes with a and takes the oc-th token
-- with REGEXP_SUBSTR; indexes beyond a row's token count give NULL and are
-- filtered out by "word is not null".
-- count(word) over (partition by id, soundex(word) ...) is the number of
-- tokens within the same id that have the same Soundex code. Every row in a
-- partition has the same id, so the window ORDER BY makes them all peers and
-- the count covers the whole partition.
-- Tokens are returned as stored (not trimmed).
SQL> with Occurence(oc) as(
  2    select level
  3      from ( select max(regexp_count(words, '[^,]+')) ml
  4               from a
  5            ) t
  6     connect by level <= t.ml
  7  )
  8  select id
  9       , word
 10       , count(word) over(partition by id, soundex(word) order by id) as count_words
 11    From ( select a.id
 12                , regexp_substr(words, '[^,]+', 1, o.oc) as word
 13             from occurence o
 14            cross join a
 15          ) s
 16            where s.word is not null
 17  order by id
 18  ;

        ID WORD                 COUNT_WORDS
---------- -------------------- -----------
         1 Cairo                          2
         1 CAIRO                          2
         1 EGYPT                          1
         1 JOHN                           1
         1 LONDAN                         2
         1 LONDON                         2
         1 OMAR ALI                       1
         1 OMAR AMR ALI                   1
         1 UK                             1
         1 YEMEN                          2
         1 YEMAN                          2
         2 ALI YAFOOZ                     1
         2 AYHAM SHAHER YAFOOZ            1
         2 SUDAI                          1
         2 SUDAIN                         1
         2 UK                             1
         3  AHMED ALI                     1
         3 MALAYSIA                       1
         3 MALYSIAN                       1

19 rows selected

Answer 4

您需要将数据作为单独的记录插入。如果你愿意的话，你可以把它们作为一个串联的字符串，但这只会让你的生活变得非常困难。所以：

-- One row per individual word.
--   id : numeric identifier stored with the word
--   w  : the word itself
--   s  : four-character code for w (presumably its Soundex code; verify
--        against whatever populates it)
create table words
( id number
, w  varchar2(100)
, s  varchar2(4)
);

-- Row-level trigger on words: before every insert or update it stores the
-- Soundex code of the incoming word in s and normalizes w to trimmed upper case.
-- The '/' on its own line after the block is what makes SQL*Plus compile the
-- trigger; without it the statements that follow are read as part of the
-- trigger source.
create or replace trigger words_auto
  before insert or update on words
  for each row
begin
  -- Plain PL/SQL assignments; no SELECT ... FROM dual is needed for built-in
  -- functions. The code is taken from the value as supplied, then w is normalized.
  :new.s := soundex(:new.w);
  :new.w := trim(upper(:new.w));
end;
/

-- Load one row per word. Only id and w are supplied; the words_auto trigger
-- normalizes w and fills in s.
insert into words (id, w) values (1, 'UK');
insert into words (id, w) values (1, 'LONDON');
-- ... one insert per remaining word, in the same form ...
insert into words (id, w) values (3, ' AHMED ALI');
insert into words (id, w) values (3, 'MALYSIAN');

您可以编写一个过程来拆分连接字符串并适当填充words表。请注意，我创建了一个触发器，将输入规范化为大写，删除所有无关的空格并自动生成Soundex代码。

现在这里有一个问题：你想用Soundex代码对单词进行分组;但是，您如何确定基线？例如，'LONDON'和'LONDAN'都有代码'L535'，但你怎么知道哪个记录是'主'记录？...你不能，没有进一步的查找表！因此，您可以做的最好的事情是按Soundex代码分组。这不必存储在表中，作为一个视图更有意义：

-- For each id and Soundex code, the number of rows in words carrying that code.
-- count(w) counts rows with a non-null w, so repeated words are counted each time.
create or replace view word_counts as
select wd.id
     , wd.s        as soundex
     , count(wd.w) as count_rows
from words wd
group by wd.id
       , wd.s;

请注意，我把计数字段命名为 count_rows，因为它统计的是记录数，而不是不同单词的个数。也就是说：“LONDON”、“LONDAN”和“LONDON”这三条记录得到的计数是 3，而不是你可能期望的 2。无论如何，对于您的数据，视图将如下所示：

id    soundex   count_rows
----- --------- -----------
1     U200      1
1     L535      2
...   ...       ...
3     M420      2
3     A534      1

正如我所说，如果没有进一步的基础设施，这真的是你能想到的最好的。

拆分并插入新表并计算这些单词

4 个答案:

这是SQLFiddle demo