Question

我编写了以下函数，它接受两个以分隔符（例如逗号）分隔的字符串，将它们分别拆分到两个临时表中，然后利用这两个临时表计算两个字符串中相同单词所占的百分比。问题是，当我对一个大约 20 万行的数据集逐行调用该函数时，查询会超时！您能看出有哪些可以优化的地方吗？

-- GetWordSimilarity
-- Purpose : score how similar two delimited word lists are.
-- Params  : @String, @String2 - delimited lists of words (each word is trimmed,
--                               blank words are skipped)
--           @Delimiter        - single-character separator
-- Returns : decimal(16,2) in the range 0.00 .. 1.00, computed as
--             (distinct words present in both lists)
--             / (word count of the longer list, duplicates included).
--           0.0 when either input is NULL/empty or yields no words.
-- NOTE(review): the numerator counts distinct words while the denominator counts
--               duplicates, so 'a,a' vs 'a' scores 0.50 - confirm this is intended.
ALTER FUNCTION [GetWordSimilarity](@String varchar(8000),
@String2 varchar(8000),@Delimiter char(1))
returns decimal(16,2)
as
begin
    declare @temptable  table (items varchar(8000))
    declare @temptable2 table (items varchar(8000))
    declare @numberOfCommonWords decimal(16,2)
    declare @countTable1 decimal(16,2)
    declare @countTable2 decimal(16,2)
    declare @idx int
    declare @slice varchar(8000)

    -- Nothing to compare when either list is missing or empty.
    if @String is null or len(@String) = 0 or @String2 is null or len(@String2) = 0
        return 0.0

    -- Split @String into @temptable, one trimmed word per row.
    set @idx = 1
    while @idx <> 0
    begin
        set @idx = charindex(@Delimiter, @String)
        set @slice = case when @idx <> 0 then left(@String, @idx - 1) else @String end

        if len(@slice) > 0
            insert into @temptable (items) values (ltrim(rtrim(@slice)))

        -- Advance past the delimiter with SUBSTRING. The previous
        -- RIGHT(@String, LEN(@String) - @idx) dropped the last word(s) whenever the
        -- input ended in spaces, because LEN() does not count trailing spaces.
        set @String = substring(@String, @idx + 1, 8000)
        if len(@String) = 0 break
    end

    -- Split @String2 into @temptable2 the same way.
    set @idx = 1
    while @idx <> 0
    begin
        set @idx = charindex(@Delimiter, @String2)
        set @slice = case when @idx <> 0 then left(@String2, @idx - 1) else @String2 end

        if len(@slice) > 0
            insert into @temptable2 (items) values (ltrim(rtrim(@slice)))

        set @String2 = substring(@String2, @idx + 1, 8000)
        if len(@String2) = 0 break
    end

    -- Count each list once; the counts serve both the empty check and the denominator.
    select @countTable1 = count(*) from @temptable
    select @countTable2 = count(*) from @temptable2

    if @countTable1 = 0 or @countTable2 = 0
        return 0.0

    -- INTERSECT already yields distinct rows, so no explicit DISTINCT is needed.
    select @numberOfCommonWords = count(*)
    from
    (
        select items from @temptable
        intersect
        select items from @temptable2
    ) a

    -- Divide by the size of the longer list.
    return @numberOfCommonWords
         / (case when @countTable1 > @countTable2 then @countTable1 else @countTable2 end)
end

非常感谢！

Answer 1

带有大量字符串操作的 T-SQL UDF 不可能在大量行上表现良好。不过，如果使用 Numbers（数字）表，还是可以获得一些性能提升：

-- Demo: split a separator-delimited list into one row per element by probing
-- character positions taken from a table named numbers (column n).
-- NOTE(review): presumably numbers.n holds consecutive integers starting at 1 - verify.
DECLARE @col_list varchar(1000)
DECLARE @sep char(1)

SELECT
    @sep = ',',
    @col_list = 'TransactionID, ProductID, ReferenceOrderID, ReferenceOrderLineID, TransactionDate, TransactionType, Quantity, ActualCost, ModifiedDate'

-- A separator at position n of (@sep + @col_list) means an element starts at
-- position n of @col_list; its length runs up to the next separator (a trailing
-- separator is appended so the last element is terminated too).
-- Elements are returned untrimmed, so leading spaces are kept.
SELECT SUBSTRING(@col_list, n, CHARINDEX(@sep, @col_list + @sep, n) - n)
FROM numbers
WHERE n < LEN(@col_list) + 1
  AND SUBSTRING(@sep + @col_list, n, 1) = @sep

最好的做法是用 SQLCLR 重写整个函数。

Answer 2

问题当然出在设计上：您不应该在 SQL 数据库中存储以逗号分隔的数据。不过，我想现在也只能将就了。可以考虑的一点是把函数改写为 SQLCLR；SQL 本身并不擅长字符串操作。（嗯，其实依我看没有哪种语言真正擅长字符串操作，但 SQL 在这方面尤其糟糕 =）

用于填充 @temptable1 和 @temptable2 的拆分器可以用 Jeff Moden 的代码进行优化。他就此写过几篇出色的文章，其中最新的一篇见：http://www.sqlservercentral.com/articles/Tally+Table/72993/

采用他的拆分器并优化其余代码之后，我在 20 万行随机数据样本上把耗时从 771 秒降到了 305 秒。需要注意的是：两个版本的结果并不完全相同。我手动检查了其中一些，实际上认为新版本的结果更准确，但没有时间对两个版本逐一排查错误。

我还尝试把它改成更基于集合的方法：先把所有单词连同各自的 row_id 加载到一张表中，然后再把它们连接起来。尽管连接本身非常快，但创建初始表耗时太长，因此整体上甚至比原始函数还慢。

也许我会再尝试找出其他方法让它更快，但目前希望这些对您有所帮助。

-- GetWordSimilarity2
-- Purpose : score how similar two delimited word lists are, splitting each list
--           with a set-based tally CTE instead of a WHILE loop.
-- Params  : @String1, @String2 - delimited lists of words (each word is trimmed,
--                                blank words are skipped)
--           @Delimiter         - single-character separator
-- Returns : decimal(16,2) in the range 0.00 .. 1.00, computed as
--             (distinct words present in both lists)
--             / (word count of the longer list, duplicates included).
--           0.0 when either input is NULL/empty or yields no words.
-- Splitter based on code from Jeff Moden
-- (http://www.sqlservercentral.com/articles/Tally+Table/72993/).
ALTER FUNCTION [GetWordSimilarity2](@String1 varchar(8000),
@String2 varchar(8000),@Delimiter char(1))
returns decimal(16,2)
as
begin
    -- row_id makes the key unique so the same word may appear more than once.
    -- NOTE(review): items is part of a clustered primary key; SQL Server limits the
    -- clustered index key size (900 bytes), so a single word longer than that would
    -- presumably fail on insert - confirm such inputs cannot occur.
    declare @temptable1 table (items varchar(8000), row_id int IDENTITY(1, 1), PRIMARY KEY (items, row_id))
    declare @temptable2 table (items varchar(8000), row_id int IDENTITY(1, 1), PRIMARY KEY (items, row_id))
    declare @numberOfCommonWords decimal(16,2)
    declare @countTable1 decimal(16,2)
    declare @countTable2 decimal(16,2)

    -- Nothing to compare when either list is missing or empty. Without this guard a
    -- NULL input produced a NULL item (rejected by the primary key) and an empty
    -- input was counted as one empty word.
    if @String1 is null or len(@String1) = 0 or @String2 is null or len(@String2) = 0
        return 0.0

    --populating @temptable1
    ;WITH E1(N) AS (
                    SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                    SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                    SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
                   ),                                    -- 10 rows
          E2(N) AS (SELECT 1 FROM E1 a CROSS JOIN E1 b), -- 100 rows
          E4(N) AS (SELECT 1 FROM E2 a CROSS JOIN E2 b), -- 10,000 rows max
    cteTally(N) AS (-- One number per byte of the input; TOP caps the row count up front.
                    SELECT TOP (ISNULL(DATALENGTH(@String1),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
                   ),
    cteStart(N1) AS (-- Start position of each element: 1, plus the position after every delimiter.
                    SELECT 1 UNION ALL
                    SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(@String1,t.N,1) = @Delimiter
                   ),
    cteLen(N1,L1) AS (-- Start and length of each element; when no further delimiter is
                      -- found, ISNULL/NULLIF falls back to 8000 (the rest of the string).
                    SELECT s.N1,
                           ISNULL(NULLIF(CHARINDEX(@Delimiter,@String1,s.N1),0)-s.N1,8000)
                      FROM cteStart s
                   ),
    cteItem(Item) AS (-- The raw, untrimmed element text.
                    SELECT SUBSTRING(@String1, l.N1, l.L1)
                      FROM cteLen l
                   )
    INSERT @temptable1 (items)
    SELECT LTRIM(RTRIM(i.Item))
      FROM cteItem i
     WHERE LEN(i.Item) > 0   -- LEN ignores trailing spaces, so blank elements are skipped

    SELECT @countTable1 = @@ROWCOUNT

    ----populating @temptable2
    ;WITH E1(N) AS (
                    SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                    SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                    SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
                   ),                                    -- 10 rows
          E2(N) AS (SELECT 1 FROM E1 a CROSS JOIN E1 b), -- 100 rows
          E4(N) AS (SELECT 1 FROM E2 a CROSS JOIN E2 b), -- 10,000 rows max
    cteTally(N) AS (-- One number per byte of the input; TOP caps the row count up front.
                    SELECT TOP (ISNULL(DATALENGTH(@String2),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
                   ),
    cteStart(N1) AS (-- Start position of each element: 1, plus the position after every delimiter.
                    SELECT 1 UNION ALL
                    SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(@String2,t.N,1) = @Delimiter
                   ),
    cteLen(N1,L1) AS (-- Start and length of each element; when no further delimiter is
                      -- found, ISNULL/NULLIF falls back to 8000 (the rest of the string).
                    SELECT s.N1,
                           ISNULL(NULLIF(CHARINDEX(@Delimiter,@String2,s.N1),0)-s.N1,8000)
                      FROM cteStart s
                   ),
    cteItem(Item) AS (-- The raw, untrimmed element text.
                    SELECT SUBSTRING(@String2, l.N1, l.L1)
                      FROM cteLen l
                   )
    INSERT @temptable2 (items)
    SELECT LTRIM(RTRIM(i.Item))
      FROM cteItem i
     WHERE LEN(i.Item) > 0   -- LEN ignores trailing spaces, so blank elements are skipped

    SELECT @countTable2 = @@ROWCOUNT

    --calculating percentage of words match
    if @countTable1 = 0 OR @countTable2 = 0
        return 0.0

    -- DISTINCT: a word repeated in either list must count only once as "common".
    select @numberOfCommonWords = COUNT(DISTINCT t1.items)
        from @temptable1 t1
        INNER JOIN @temptable2 t2
        ON t1.items = t2.items

    -- Divide by the size of the longer list.
    RETURN @numberOfCommonWords / (CASE WHEN (@countTable1 > @countTable2) THEN @countTable1 ELSE @countTable2 END)

end

Sql UDF优化

2 个答案: