我使用的是Oracle 11g,我想将一个列(JobDescription)从Persons表中拆分成单独的单词。
例如,如果人员A的职位描述是"专业 StackOverflow 贡献者",我希望向另一张表中插入3行,每行对应职位描述中的一个单词。
参考另一篇帖子,我得到了下面这条在较小数据集上可以正常工作的语句。但我的表中有不到50万条记录,这条语句到现在已经运行了2天,仍未结束。
-- Split each JobDescription into one row per space-separated word and load
-- the distinct (PersonID, Department, Word) combinations into WORDS.
--
-- The two PRIOR predicates keep the hierarchy inside a single source row:
--   * PRIOR PersonID = PersonID stops every row from becoming a child of
--     every other row at each level; without it the intermediate row count
--     explodes with the number of words and the statement never finishes
--     on a large table.
--   * PRIOR SYS_GUID() IS NOT NULL is always true, but because SYS_GUID() is
--     non-deterministic it keeps Oracle from reporting a CONNECT BY loop
--     (ORA-01436) when a row is joined to itself.
-- Assumes PersonID is a non-null key that identifies one row of Persons
-- -- TODO confirm against the table definition.
INSERT INTO WORDS (PersonID, Department, Word)
SELECT DISTINCT
    PersonID,
    Department,
    TRIM(REGEXP_SUBSTR(str, '[^,]+', 1, LEVEL))
FROM (
    -- Turn spaces into commas so the regular expression can pick out tokens.
    SELECT
        PersonID,
        Department,
        TRIM(REPLACE(JobDescription, ' ', ',')) AS str
    FROM Persons
) t
CONNECT BY INSTR(str, ',', 1, LEVEL - 1) > 0
    AND PRIOR PersonID = PersonID
    AND PRIOR SYS_GUID() IS NOT NULL;
有没有其他更快的方法可以得到同样的结果?
答案 0 :(得分:0)
对于一次性的工作,我认为没有理由不采用过程化(PL/SQL)的方式。这应该足够快(在我的系统上,处理一张250万行的表用了250秒)。如果您的单词可能超过40个字符,请相应地增大 varchar2 变量的长度。
-- One-off helper: for every person with a non-null JobDescription, split the
-- description on single spaces and insert each distinct word once into WORDS
-- together with the person's PersonID and Department.
--
-- Words longer than 40 characters do not fit the local variables; increase
-- both VARCHAR2(40) sizes below if such words can occur.
create or replace procedure tmp_split_job as
    -- Set of distinct words for the current person; only the keys are used.
    TYPE wtype IS TABLE OF NUMBER INDEX BY VARCHAR2(40);
    uwords wtype;
    w varchar2(40);   -- word being extracted / iterated
    i pls_integer;    -- 1-based scan position inside the description
    n pls_integer;    -- length of the description
    p pls_integer;    -- position of the next space at or after i, 0 if none
    cursor c_fetch is
        select PersonID, Department, JobDescription
        from Persons
        where JobDescription is not null;
begin
    for v_row in c_fetch loop
        n := length(v_row.JobDescription);
        i := 1;
        while i <= n loop
            p := instr(v_row.JobDescription, ' ', i);
            -- INSTR returns 0 when no further space exists, so compare
            -- against 0: a space at position 1 is still a valid separator.
            if p > 0 then
                w := substr(v_row.JobDescription, i, p - i);
                i := p + 1;
            else
                -- Last word: take everything up to the end of the string.
                w := substr(v_row.JobDescription, i);
                i := n + 1;
            end if;
            -- Leading or consecutive spaces yield an empty (NULL) token,
            -- which cannot be used as an associative-array key; skip it.
            if w is not null then
                uwords(w) := 1;
            end if;
        end loop;

        -- Emit one row per distinct word collected for this person.
        w := uwords.FIRST;
        while w is not null loop
            insert into words (PersonID, Department, Word)
            values (v_row.PersonID, v_row.Department, w);
            w := uwords.next(w);
        end loop;

        -- Reset the set before processing the next person.
        uwords.DELETE;
    end loop;
end;
/
-- Run the one-off splitter once, then remove it from the schema.
BEGIN
    tmp_split_job;
END;
/
DROP PROCEDURE tmp_split_job;