返回SQL Server 2016中STRING_SPLIT位置的值

时间:2018-04-27 11:03:22

标签: sql-server

我可以使用SQL Server 2016或更高版本中的STRING_SPLIT函数返回特定位置的值吗?

我知道选择的顺序无法保证,但它是否与STRING_SPLIT一致?

-- Start from a clean slate so the script can be re-run in the same session.
DROP TABLE IF EXISTS #split

-- Build a two-row temp table holding underscore-delimited sample strings.
SELECT 'z_y_x' AS splitIt
INTO #split UNION
SELECT 'a_b_c'

SELECT * FROM #split;

-- Number the pieces STRING_SPLIT returns for each source string.
-- NOTE(review): the ORDER BY key is the same as the PARTITION BY key, so all
-- rows inside a partition tie on it; the numbering is therefore not tied to the
-- piece's position in the string by this query alone.
WITH cte
AS (
SELECT      ROW_NUMBER() OVER ( PARTITION BY s.splitIt ORDER BY s.splitIt ) AS position,
            s.splitIt,
            value
FROM        #split s
CROSS APPLY STRING_SPLIT(s.splitIt, '_')
)
-- Keep only the piece that was numbered 2 for each source string.
SELECT * FROM cte WHERE position = 2

这总是会返回第二个元素的值吗?也就是说,对于 a_b_c 返回 b,对于 z_y_x 返回 y?

我不明白为什么Microsoft不会在此功能的值旁边返回位置指示器列。

5 个答案:

答案 0 :(得分:6)

从 v2016 开始可用的解决方案:使用 FROM OPENJSON()

-- Split a comma-separated list while keeping each element's position.
-- The list is rewritten into a JSON array literal ('a,b' -> '["a","b"]');
-- OPENJSON then returns the zero-based array index as [key].
DECLARE @str VARCHAR(100) = 'val1,val2,val3';

-- STRING_ESCAPE makes the elements JSON-safe first, so quotes, backslashes or
-- control characters inside the list cannot break the generated JSON text.
-- The delimiter itself must not be a character that STRING_ESCAPE rewrites.
SELECT [key], [value], [type]
FROM OPENJSON('["' + REPLACE(STRING_ESCAPE(@str, 'json'), ',', '","') + '"]');

结果

key value   type
0   val1    1
1   val2    1
2   val3    1

文档清楚地表明:

  

当OPENJSON解析JSON数组时,该函数将JSON文本中元素的索引作为键返回。

您的情况是:

-- Sample data: two underscore-delimited strings in a temp table.
SELECT 'z_y_x' AS splitIt
INTO #split UNION
SELECT 'a_b_c'

-- Single-character delimiter; it must not be a character that STRING_ESCAPE
-- rewrites (for example a double quote or a backslash).
DECLARE @delimiter CHAR(1)='_';

-- Turn each string into a JSON array literal and let OPENJSON report the
-- array index of every element as [key]. STRING_ESCAPE keeps the generated
-- JSON valid when a value contains quotes, backslashes or control characters.
SELECT * 
FROM #split
CROSS APPLY OPENJSON('["' + REPLACE(STRING_ESCAPE(splitIt, 'json'), @delimiter, '","') + '"]') s
WHERE s.[key]=1; --zero based: 1 is the second element

让我们希望,STRING_SPLIT()的未来版本将包含此信息

更新:性能测试,与流行的 Jeff Moden 拆分函数(DelimitedSplit8K)进行比较

尝试一下:

USE master;
GO

CREATE DATABASE dbTest;
GO

USE dbTest;
GO
--Jeff Moden's splitter
--Inline table-valued function that splits @pString on the single character
--@pDelimiter and returns one row per element:
--  ItemNumber : 1-based sequence number, ordered by the element's start position
--  Item       : the element text
CREATE FUNCTION [dbo].[DelimitedSplit8K](@pString VARCHAR(8000), @pDelimiter CHAR(1))
RETURNS TABLE WITH SCHEMABINDING AS
 RETURN
  --E1, E2 and E4 multiply constant rows by cross joining, giving a row source
  --that does not read from any table.
  WITH E1(N) AS (
                 SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                 SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                 SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
                ),                          --10 rows
       E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10 x 10 = 100 rows
       E4(N) AS (SELECT 1 FROM E2 a, E2 b), --100 x 100 = 10,000 rows max
 --cteTally: the numbers 1..DATALENGTH(@pString), i.e. one number per byte of
 --the input (zero rows when @pString is NULL).
 cteTally(N) AS (
                 SELECT TOP (ISNULL(DATALENGTH(@pString),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
                ),
--cteStart: start position of every element, which is position 1 plus the
--position right after each delimiter found in the string.
cteStart(N1) AS (
                 SELECT 1 UNION ALL
                 SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(@pString,t.N,1) = @pDelimiter
                ),
--cteLen: length of each element = distance from its start to the next
--delimiter; when no further delimiter exists (CHARINDEX returns 0) the
--length falls back to 8000 so SUBSTRING takes the rest of the string.
cteLen(N1,L1) AS(
                 SELECT s.N1,
                        ISNULL(NULLIF(CHARINDEX(@pDelimiter,@pString,s.N1),0)-s.N1,8000)
                   FROM cteStart s
                )
 --Final result: number the elements by start position and cut them out.
 SELECT ItemNumber = ROW_NUMBER() OVER(ORDER BY l.N1),
        Item       = SUBSTRING(@pString, l.N1, l.L1)
   FROM cteLen l
;
GO
--Warm-up call so the first measured run does not pay one-time costs
SELECT ItemNumber, Item
FROM dbo.DelimitedSplit8K('a,b,c', ',');
GO

--Holds one row per measured test section
CREATE TABLE Results
(
    ID           INT IDENTITY,
    ResultSource VARCHAR(100),
    durationMS   INT,
    RowsCount    INT
);
GO
--Holds the strings to split, stored once as nvarchar and once as varchar
CREATE TABLE dbo.DelimitedItems
(
    ID               INT IDENTITY,
    DelimitedNString nvarchar(4000),
    DelimitedString  varchar(8000)
);
GO

-- 生成若干行,每行是由 100 个随机项目混合而成的字符串
-- 可以尝试调整行数(GO 后面的数字)以及 TOP 的数量

--Add one row whose string is made of 100 randomly picked names joined by
--commas (a comma inside a name is replaced with ';' so it cannot act as a
--delimiter). STUFF removes the leading comma.
INSERT INTO DelimitedItems(DelimitedNString)
SELECT STUFF((
            SELECT TOP 100 ','+REPLACE(v.[name],',',';') 
            FROM master..spt_values v
            WHERE LEN(v.[name])>0
            ORDER BY NewID()
            FOR XML PATH('')),1,1,'')
--Keep it twice in varchar and nvarchar.
--Only the row added by this batch still has a NULL DelimitedString, so the
--filter avoids rewriting every earlier row on each repetition of the batch.
UPDATE DelimitedItems SET DelimitedString=DelimitedNString WHERE DelimitedString IS NULL;
GO 500 --create 500 differently mixed rows

-- 测试

-- Benchmark: four sections with the same shape. Each one stores the current
-- UTC time in @d, materializes the split rows into its own temp table, then
-- logs a label, the temp table's row count and the milliseconds elapsed since
-- @d into Results.
DECLARE @d DATETIME2;

-- Section 1: DelimitedSplit8K reading the nvarchar(4000) column.
SET @d = SYSUTCDATETIME();
    SELECT DI.ID, DS.Item, DS.ItemNumber
    INTO #TEMP
    FROM dbo.DelimitedItems DI
         CROSS APPLY dbo.DelimitedSplit8K(DI.DelimitedNString,',') DS;
INSERT INTO Results(ResultSource,RowsCount,durationMS)
SELECT 'delimited8K with NVARCHAR(4000)'
      ,(SELECT COUNT(*) FROM #TEMP) AS RowCountInTemp
      ,DATEDIFF(MILLISECOND,@d,SYSUTCDATETIME()) AS Duration_NV_ms_delimitedSplit8K

-- Section 2: DelimitedSplit8K reading the varchar(8000) column.
SET @d = SYSUTCDATETIME();
    SELECT DI.ID, DS.Item, DS.ItemNumber
    INTO #TEMP2
    FROM dbo.DelimitedItems DI
         CROSS APPLY dbo.DelimitedSplit8K(DI.DelimitedString,',') DS;
INSERT INTO Results(ResultSource,RowsCount,durationMS)
SELECT 'delimited8K with VARCHAR(8000)'
      ,(SELECT COUNT(*) FROM #TEMP2) AS RowCountInTemp
      ,DATEDIFF(MILLISECOND,@d,SYSUTCDATETIME()) AS Duration_V_ms_delimitedSplit8K

-- Section 3: OPENJSON over the nvarchar(4000) column; the comma-separated
-- string is rewritten into a JSON array literal and [Key] serves as ItemNumber.
SET @d = SYSUTCDATETIME();
    SELECT DI.ID, OJ.[Value] AS Item, OJ.[Key] AS ItemNumber
    INTO #TEMP3
    FROM dbo.DelimitedItems DI
         CROSS APPLY OPENJSON('["' +  REPLACE(DI.DelimitedNString,',','","') + '"]') OJ;
INSERT INTO Results(ResultSource,RowsCount,durationMS)
SELECT 'OPENJSON with NVARCHAR(4000)'
      ,(SELECT COUNT(*) FROM #TEMP3) AS RowCountInTemp
      ,DATEDIFF(MILLISECOND,@d,SYSUTCDATETIME()) AS Duration_NV_ms_OPENJSON

-- Section 4: OPENJSON over the varchar(8000) column.
SET @d = SYSUTCDATETIME();
    SELECT DI.ID, OJ.[Value] AS Item, OJ.[Key] AS ItemNumber
    INTO #TEMP4
    FROM dbo.DelimitedItems DI
         CROSS APPLY OPENJSON('["' +  REPLACE(DI.DelimitedString,',','","') + '"]') OJ;
INSERT INTO Results(ResultSource,RowsCount,durationMS)
SELECT 'OPENJSON with VARCHAR(8000)'
      ,(SELECT COUNT(*) FROM #TEMP4) AS RowCountInTemp
      ,DATEDIFF(MILLISECOND,@d,SYSUTCDATETIME()) AS Duration_V_ms_OPENJSON
GO
SELECT * FROM Results;
GO

-- 清理

-- Remove the temp tables created by the test sections. IF EXISTS keeps the
-- cleanup from failing when a section was skipped or the cleanup is repeated.
DROP TABLE IF EXISTS #TEMP;
DROP TABLE IF EXISTS #TEMP2;
DROP TABLE IF EXISTS #TEMP3;
DROP TABLE IF EXISTS #TEMP4;

-- Switch away from the test database before dropping it.
USE master;
GO
DROP DATABASE IF EXISTS dbTest;

结果:

500行中的200个项目

1220    delimited8K with NVARCHAR(4000)
 274    delimited8K with VARCHAR(8000)
 417    OPENJSON with NVARCHAR(4000)
 443    OPENJSON with VARCHAR(8000)

500行中的100个项目

421 delimited8K with NVARCHAR(4000)
140 delimited8K with VARCHAR(8000)
213 OPENJSON with NVARCHAR(4000)
212 OPENJSON with VARCHAR(8000)

5行中的100个项目

10  delimited8K with NVARCHAR(4000)
5   delimited8K with VARCHAR(8000)
3   OPENJSON with NVARCHAR(4000)
4   OPENJSON with VARCHAR(8000)

500行中有5个项目

32  delimited8K with NVARCHAR(4000)
30  delimited8K with VARCHAR(8000)
28  OPENJSON with NVARCHAR(4000)
24  OPENJSON with VARCHAR(8000)

-- 长度不受限制(只有 OPENJSON 可以做到)
-- 填充数据时不使用 TOP 子句
-- 结果为 500 行,每行约 500 个项目

1329    OPENJSON with NVARCHAR(4000)
1117    OPENJSON with VARCHAR(8000)

简单:

  • 流行的拆分函数不擅长处理 NVARCHAR
  • 该函数仅适用于 8k 字节容量以内的字符串
  • 只有在 VARCHAR、项目很多且行数很多的情况下,拆分函数才领先。
  • 在所有其他情况下,OPENJSON 似乎都或多或少更快...
  • OPENJSON 可以处理(几乎)无限数量的项目
  • OPENJSON 需要 v2016 或更高版本
  • 大家都在等待带位置信息的 STRING_SPLIT

更新:在测试中添加了 STRING_SPLIT

同时,我使用STRING_SPLIT()通过另外两个测试部分重新运行该测试。作为位置,我必须返回一个硬编码值,因为此函数不会返回零件的索引。

在所有测试的案例中,OPENJSON 与 STRING_SPLIT 接近,而且通常更快:

1000行中有5个项目

250 delimited8K with NVARCHAR(4000)
124 delimited8K with VARCHAR(8000) --this function is best with many rows in VARCHAR
203 OPENJSON with NVARCHAR(4000)
204 OPENJSON with VARCHAR(8000)
235 STRING_SPLIT with NVARCHAR(4000)
234 STRING_SPLIT with VARCHAR(8000)
30行中的200个项目

140 delimited8K with NVARCHAR(4000)
31  delimited8K with VARCHAR(8000)
47  OPENJSON with NVARCHAR(4000)
31  OPENJSON with VARCHAR(8000)
47  STRING_SPLIT with NVARCHAR(4000)
31  STRING_SPLIT with VARCHAR(8000)
10,000行中的100个项目

8145    delimited8K with NVARCHAR(4000)
2806    delimited8K with VARCHAR(8000) --fast with many rows!
5112    OPENJSON with NVARCHAR(4000)
4501    OPENJSON with VARCHAR(8000)
5028    STRING_SPLIT with NVARCHAR(4000)
5126    STRING_SPLIT with VARCHAR(8000)

答案 1 :(得分:3)

简单的答案是:不能。到目前为止,微软拒绝在 STRING_SPLIT 返回的数据集中提供序号(Ordinal)位置。恐怕您需要使用其他解决方案,例如 Jeff Moden 的 DelimitedSplit8k。

(是的,我知道这或多或少只是一个链接答案,但是,在这里粘贴杰夫的解决方案实际上是抄袭)。

如果您使用Jeff的解决方案,那么您可以执行以下操作:

-- Return only the second element of the comma-separated list.
SELECT
    DS.ItemNumber,
    DS.Item
FROM dbo.DelimitedSplit8K('a,b,c,d,e,f,g,h,i,j,k', ',') AS DS
WHERE DS.ItemNumber = 2;

当然,您可能会传递列而不是文字字符串。

答案 2 :(得分:1)

我不想处理 OPENJSON ,但仍然想按索引获取string_split()值。 就我而言,性能并不是问题。

我使用了CTE(通用表表达式)

假设您有字符串str =“ part1 part2 part3”。

-- Split the sample string on spaces, number the pieces by their text in
-- ascending order, and return the piece that received number 2.
WITH pieces AS
(
    SELECT value
    FROM STRING_SPLIT('part1 part2 part3', ' ')
),
numbered_pieces AS
(
    SELECT
        pieces.[value],
        ROW_NUMBER() OVER (ORDER BY pieces.[value] ASC) AS [RowNumber]
    FROM pieces
)
SELECT
    [value],
    [RowNumber]
FROM numbered_pieces
WHERE [RowNumber] = 2

但是请注意:根据 ORDER BY 的条件,各部分的顺序可能会发生变化!

The output for the second row with "part2" value:

答案 3 :(得分:1)

如果待拆分的文本包含换行符、Unicode 字符或其他与 JSON 不兼容的字符,可以使用 STRING_ESCAPE;我只是在 @Shnugo 的回答基础上做了这一扩展。

我的测试代码用管道作为分隔符而不是逗号:

-- Split text that contains quotes, non-ASCII letters and a line break.
-- NVARCHAR variables and N'' literals keep characters such as 'ğ' and 'ı'
-- intact regardless of the database's default code page.
DECLARE @Separator NVARCHAR(5) = N'|'; -- or use any other separator
DECLARE @LongText NVARCHAR(MAX) = N'Albert says: "baby, listen!"|ve Çağrı söylüyor: "Elma"|1st Line' + NCHAR(13) + NCHAR(10) + N'2nd line';

-- STRING_ESCAPE makes the text JSON-safe before it is wrapped into an array
-- literal. The separator must not be a character that STRING_ESCAPE rewrites.
SELECT * FROM OPENJSON(N'["' +  REPLACE(STRING_ESCAPE(@LongText, 'json'), @Separator ,N'","') + N'"]'); -- ok
-- SELECT * FROM OPENJSON('["' +  REPLACE(@LongText, @Separator ,'","') + '"]'); -- fails with: JSON text is not properly formatted. ...

答案 4 :(得分:0)

这是我的解决方法。我会按照问题等待更好的答案:

更新:原始代码没有考虑一个词是否包含另一个词 更新 2:生产中的性能很糟糕,所以我不得不另想办法。你在最后有它作为选项 2,表的实现:

在字符串中实现:

-- Sample sentence; it repeats words and has consecutive spaces.
declare @a as nvarchar(100) = 'Lorem ipsum dolor dol ol sit amet. D  Lorem DO ipsum  DOL dolor sit amet. DOLORES ipsum';

-- T: one row per non-empty piece returned by STRING_SPLIT.
--    INDX = position of the first occurrence of the piece surrounded by spaces
--           (both the piece and @a are padded with a space on each side).
--    RN   = running number among rows that share the same value.
WITH T AS (
    SELECT T1.value
            ,charindex(' ' + T1.value + ' ',' ' + @a + ' ' ,0) AS INDX
            ,RN = ROW_NUMBER() OVER (PARTITION BY value order BY value) 
        FROM STRING_SPLIT(@a, ' ') AS T1
        WHERE T1.value <> ''
),
-- R: recursive walk over repeated values.
--    Anchor: the RN = 1 row of every value, carrying its first position.
--    Recursive step: the row of the same value with the next RN; its position
--    is searched starting one character after the previously found INDX.
R (VALUE,INDX,RN) AS (
    SELECT * 
    FROM T
    WHERE T.RN = 1
UNION ALL
    SELECT T.VALUE
        ,charindex(' ' + T.value + ' ',' ' + @a + ' ',R.INDX + 1) AS INDX
        ,T.RN
    FROM T
    JOIN R
        ON T.value = R.VALUE
            AND T.RN = R.RN + 1
)
-- Sorting by the found character position lists the pieces in that order.
SELECT * FROM R ORDER BY INDX

结果: tableOfResults

修改表中的列,选项 1:

-- T: one row per non-empty piece of every source row.
--    INDX = position of the first occurrence of the piece surrounded by spaces
--           inside its own stringToBeSplit (both padded with a space).
--    RN   = running number among rows sharing column1, stringToBeSplit and value.
WITH T AS (
    SELECT T1.stringToBeSplit
            ,T1.column1 --column1 is an example of a column where stringToBeSplit is the same for more than one record; better avoided, but if you need it, follow column1 through the code
            ,T1.column2
            ,T1.value
            ,T1.column3
            /*,...any other column*/
            ,charindex(' ' + T1.value + ' ',' ' + T1.stringToBeSplit + ' ' ,0) AS INDX
            ,RN = ROW_NUMBER() OVER (PARTITION BY T1.column1, T1.stringToBeSplit, T1.value order BY T1.column1, T1.stringToBeSplit, T1.value) --any column that creates duplicates needs to be added here; column1 is the example
        -- NOTE(review): TOP without ORDER BY picks an arbitrary 10 rows.
        FROM (SELECT TOP 10 * FROM YourTable D CROSS APPLY string_split(D.stringToBeSplit,' ')) AS T1
        WHERE T1.value <> ''
),
-- R: recursive walk over repeated values.
--    Anchor: the RN = 1 row of every (column1, stringToBeSplit, value) group.
--    Recursive step: the row of the same group with the next RN; its position
--    is searched starting one character after the previously found INDX.
R (stringToBeSplit, column1, column2, value, column3, INDX, RN) AS (
    SELECT stringToBeSplit, column1, column2, value, column3, INDX, RN
    FROM T
    WHERE T.RN = 1
UNION ALL
    SELECT T.stringToBeSplit, T.column1, T.column2, T.value, T.column3
        ,charindex(' ' + T.value + ' ',' ' + T.stringToBeSplit + ' ',R.INDX + 1) AS INDX
        ,T.RN
    FROM T
    JOIN R
        ON T.value = R.VALUE
            AND T.COLUMN1 = R.COLUMN1 --any column that creates duplicates needs to be added here; column1 is the example
            AND T.stringToBeSplit = R.stringToBeSplit --match the same source string, mirroring the PARTITION BY above
            AND T.RN = R.RN + 1
)
SELECT * FROM R ORDER BY column1, stringToBeSplit, INDX

修改表中的一列,选项 2(这是我能获得的最佳性能:主要改进来自去掉连接,并找到一种正确执行(和终止)CTE 递归循环的方法;耗时从 1000 行约 1 分 30 秒,降到 3 万行类似类型和长度的字符串只需 2 秒):

-- T: one row per distinct (stringToBeSplit, value) pair with a non-empty value.
--    INDX = position of the first occurrence of the value surrounded by spaces
--           inside its stringToBeSplit (both padded with a space).
--    [your table] and [your filter] are placeholders to be replaced.
WITH T AS (
    SELECT  T1.stringToBeSplit --no extracolumns this time
            ,T1.value
            ,charindex(' ' + T1.value + ' ',' ' + T1.stringToBeSplit + ' ' ,0) AS INDX
            ,RN = ROW_NUMBER() OVER (PARTITION BY T1.stringToBeSplit,T1.value order BY T1.stringToBeSplit,T1.value) --from clause use distinct and where if possible
        FROM (SELECT DISTINCT stringToBeSplit, VALUE FROM [your table] D CROSS APPLY string_split(D.stringToBeSplit,' ') WHERE [your filter]) AS T1
        WHERE T1.value <> ''
),
-- R: recursive walk that needs no join back to T.
--    Anchor: the RN = 1 rows of T.
--    Recursive step: from each R row, search the same value again starting one
--    character after the previous INDX and increase RN by one; the WHERE
--    clause ends the recursion once charindex finds no further occurrence (0).
R (stringToBeSplit, value, INDX, RN) AS (
    SELECT stringToBeSplit, value, INDX, RN
    FROM T
    WHERE T.RN = 1
UNION ALL
    SELECT R.stringToBeSplit, R.value
        ,charindex(' ' + R.value + ' ',' ' + R.stringToBeSplit + ' ',R.INDX + 1) AS INDX
        ,R.RN + 1
    FROM R
    WHERE charindex(' ' + R.value + ' ',' ' + R.stringToBeSplit + ' ',R.INDX + 1) <> 0
)
SELECT * FROM R ORDER BY stringToBeSplit, INDX

若要得到单词的顺序,请将 SELECT * FROM R 替换为:

-- Number the words of each source string by their character position.
SELECT
    stringToBeSplit,
    value,
    ROW_NUMBER() OVER (PARTITION BY stringToBeSplit ORDER BY [indX]) AS ORD
FROM R

如果您更希望每个单词占一列,而不是每个单词占一行:

-- One row per source string with its first three words spread into the
-- columns [1], [2] and [3]. The grouping column is stringToBeSplit, the
-- column that R actually exposes.
SELECT *
FROM (
    SELECT stringToBeSplit,
           value,
           ROW_NUMBER() OVER (PARTITION BY stringToBeSplit ORDER BY [indX]) AS ORD
    FROM R
) AS R2
PIVOT (MAX(VALUE) FOR ORD IN ([1],[2],[3])) AS PIV

如果您不想指定列数,可以参考此链接中使用 QUOTENAME() 的做法;就我而言,我只需要前 4 个单词,其余的暂时无关紧要。为防止链接失效,下面附上该页面中的代码:

-- Build and run a PIVOT whose column list is taken from the category names.
DECLARE 
    @columns NVARCHAR(MAX) = '', 
    @sql     NVARCHAR(MAX) = '';

-- Collect the quoted category names as one comma-separated list.
-- FOR XML PATH concatenates every row in the requested order; TYPE/.value()
-- returns the text without XML entity encoding; STUFF removes the leading
-- comma. @columns becomes NULL when there are no categories.
SELECT @columns = STUFF((
        SELECT ',' + QUOTENAME(c.category_name)
        FROM production.categories AS c
        ORDER BY c.category_name
        FOR XML PATH(''), TYPE
    ).value('.', 'NVARCHAR(MAX)'), 1, 1, '');

-- Without at least one column the PIVOT statement would be invalid.
IF @columns IS NULL OR @columns = ''
BEGIN
    PRINT 'No categories found; nothing to pivot.';
END
ELSE
BEGIN
    -- construct dynamic SQL
    SET @sql ='
SELECT * FROM   
(
    SELECT 
        category_name, 
        model_year,
        product_id 
    FROM 
        production.products p
        INNER JOIN production.categories c 
            ON c.category_id = p.category_id
) t 
PIVOT(
    COUNT(product_id) 
    FOR category_name IN ('+ @columns +')
) AS pivot_table;';

    -- execute the dynamic SQL
    EXECUTE sp_executesql @sql;
END

最后但同样重要的一点:我很想知道在 SQL Server 或 C# 中是否有更简单且性能相同的方法。我只是认为,凡是不依赖外部信息的处理都应该留在服务器中,作为查询或批处理运行;但说实话我并不确定,因为我也听到过相反的意见(尤其是来自使用 pandas 的人),只是还没有人说服我。