SQL如何优化拆分字符串并将字词插入新表?

时间:2017-05-06 02:07:07

标签: sql sql-server

有没有办法在更短的时间内完成这项工作?我从案例表(cases)中读取摘要(Summary)列,并使用以下循环将其逐词拆分后写入单词表(words):

Example case table
CaseID | CaseNumber | Summary
1        111111       This is a summary
2        111112       This is Summary 2 

-- Row-by-row driver: for each caseID from 1 to 1000, split that case's summary
-- on spaces with dbo.Split6 and store the resulting words in the words table.
DECLARE @currentCaseId int = 1;

WHILE @currentCaseId <= 1000
BEGIN
    INSERT INTO words (caseID, caseNumber, pn, word)
    SELECT
        caseID,
        caseNumber,
        pn,
        word
    FROM dbo.Split6(
             ' ',
             (select summary from cases where caseID = @currentCaseId)
         )
    -- Split6 emits its pieces for every row of cases; keep the current case only.
    WHERE caseID = @currentCaseId
    GROUP BY
        caseID,
        caseNumber,
        pn,
        word
    -- Split6 recurses once per piece; 0 removes the recursion limit.
    OPTION (MAXRECURSION 0);

    SET @currentCaseId = @currentCaseId + 1;
END
GO

它可以运行,但速度很慢:拆分1000条案例花了3个小时,而我有10万条案例。有没有办法更高效地做到这一点?这是我正在使用的拆分函数:

Split6 function:
-- Splits @s on the single-character separator @sep with a recursive CTE.
--   @sep : separator character
--   @s   : string to split (at most 4000 characters)
-- Returns one row per piece: caseID, caseNumber, pn (1-based piece number)
-- and word (the text of the piece).
-- NOTE: the anchor member reads every row of cases without a filter, so each
-- piece is returned once per case row; callers must filter on caseID.
-- The CTE recurses once per piece, so callers that expect many pieces need
-- OPTION (MAXRECURSION ...) on the calling statement.
CREATE FUNCTION [dbo].[Split6] (
    @sep CHAR(1)
    ,@s nVARCHAR(4000)
)
RETURNS TABLE
AS
RETURN (
    WITH Pieces(caseID, caseNumber, pn, start, stop) AS (
        -- Anchor: first piece starts at position 1 and ends at the first separator.
        SELECT cs.caseID
            ,cs.caseNumber
            ,1
            ,1
            ,CHARINDEX(@sep, @s)
        FROM cases cs

        UNION ALL

        -- Recursive step: next piece starts right after the previous separator.
        SELECT caseID
            ,caseNumber
            ,pn + 1
            ,stop + 1
            ,CHARINDEX(@sep, @s, stop + 1)
        FROM Pieces
        WHERE stop > 0  -- stop = 0 means no further separator was found
    )
    SELECT caseID
        ,caseNumber
        ,pn
        ,SUBSTRING(@s, start, CASE
                WHEN stop > 0
                    THEN stop - start
                -- Last piece: take the rest of the string. 4000 is the declared
                -- maximum length of @s, so the tail is never cut short.
                ELSE 4000
                END) AS word
    FROM Pieces
)
GO

1 个答案:

答案 0 :(得分:0)

你应该尽可能避免循环。

以下示例将 Parse/Split 函数与 Cross Apply 结合使用(若要显示空值,请改用 Outer Apply)。

就性能而言:使用包含100,000条记录、平均每条5个单词的测试样本,执行时间为2.2秒。

示例

-- Sample data with the same columns as the case table in the question.
Declare @YourTable Table ([CaseID] varchar(50),[CaseNumber] varchar(50),[Summary] varchar(50));
Insert Into @YourTable ([CaseID],[CaseNumber],[Summary]) Values
 ('1','111111','This is a summary')
,('2','111112','This is Summary 2');

-- Set-based split: the function is applied once per row of @YourTable and
-- each returned piece is paired with the CaseID/CaseNumber of its source row.
Select A.CaseID
      ,A.CaseNumber
      ,B.RetSeq
      ,B.RetVal
 From @YourTable A
 Cross Apply [dbo].[udf-Str-Parse](A.Summary,' ') B;

<强>返回

CaseID  CaseNumber  RetSeq  RetVal
1       111111      1       This
1       111111      2       is
1       111111      3       a
1       111111      4       summary
2       111112      1       This
2       111112      2       is
2       111112      3       Summary
2       111112      4       2

感兴趣的UDF

-- Inline table-valued function that splits @String on @Delimiter by turning
-- the string into an XML fragment and shredding it.
--   @String    : text to split
--   @Delimiter : separator text
-- Returns one row per piece: RetSeq (row number) and RetVal (piece with
-- leading and trailing spaces removed).
CREATE FUNCTION [dbo].[udf-Str-Parse] (@String varchar(max),@Delimiter varchar(10))
Returns Table
As
Return (
    Select Row_Number() over (Order By (Select null)) As RetSeq
          ,LTrim(RTrim(Item.XmlNode.value('(./text())[1]', 'varchar(max)'))) As RetVal
    From  (
            -- 1. Swap the delimiter for a placeholder token.
            -- 2. For XML Path('') entity-encodes the text so the Cast to xml is safe.
            -- 3. Swap the placeholder for element boundaries and wrap in one outer element pair.
            Select Cast(
                        '<x>'
                        + replace(
                              (Select replace(@String,@Delimiter,'§§Split§§') as [*] For XML Path('')),
                              '§§Split§§',
                              '</x><x>'
                          )
                        + '</x>'
                   as xml).query('.') As Doc
          ) as Src
    Cross Apply Src.Doc.nodes('x') AS Item(XmlNode)
);
--Thanks Shnugo for making this XML safe
--Select * from [dbo].[udf-Str-Parse]('Dog,Cat,House,Car',',')
--Select * from [dbo].[udf-Str-Parse]('John Cappelletti was here',' ')
--Select * from [dbo].[udf-Str-Parse]('this,is,<test>,for,< & >',',')
  

编辑 - 另一个解析/拆分功能

以下TVF比XML版本略快,但限制为8K。例如,在5,000个样本记录中,平均为36个“单词”,比XML版本快20ms。

-- Inline table-valued function that splits @String on @Delimiter using a
-- generated list of character positions instead of XML.
--   @String    : text to split
--   @Delimiter : separator text (may be longer than one character)
-- Returns one row per piece: RetSeq (numbered by start position) and RetVal
-- (piece with leading and trailing spaces removed).
CREATE FUNCTION [dbo].[udf-Str-Parse-8K] (@String varchar(max),@Delimiter varchar(25))
Returns Table
As
Return (
    with   -- Ten rows; cross joined four times below for up to 10,000 rows.
           Ten(N) As (
               Select 1
               From   (Values(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) V(N)
           ),
           -- One number per byte of @String: 1, 2, 3, ...
           Positions(N) As (
               Select Top (IsNull(DataLength(@String),0))
                      Row_Number() over (Order By (Select NULL))
               From   Ten a
                      Cross Join Ten b
                      Cross Join Ten c
                      Cross Join Ten d
           ),
           -- Start of each piece: position 1, plus the position just past every delimiter.
           Starts(N) As (
               Select 1
               Union All
               Select p.N + DataLength(@Delimiter)
               From   Positions p
               Where  Substring(@String, p.N, DataLength(@Delimiter)) = @Delimiter
           ),
           -- Length of each piece: distance to the next delimiter, or 8000 when none follows.
           Spans(N,L) As (
               Select st.N
                     ,IsNull(NullIf(CharIndex(@Delimiter,@String,st.N),0) - st.N, 8000)
               From   Starts st
           )

    Select Row_Number() over (Order By sp.N) As RetSeq
          ,LTrim(RTrim(Substring(@String, sp.N, sp.L))) As RetVal
    From   Spans sp
);
--Original Source http://www.sqlservercentral.com/articles/Tally+Table/72993/
--Select * from [dbo].[udf-Str-Parse-8K]('Dog,Cat,House,Car',',')
--Select * from [dbo].[udf-Str-Parse-8K]('John||Cappelletti||was||here','||')