将[每行一个单词]加入到包含[每行多个单词]的短语行中

时间:2011-02-08 04:47:53

标签: sql sql-server tsql sql-server-2008

请原谅问题的长度。我提供了一个测试脚本来演示情况以及我对解决方案的最佳尝试。

有两个表:

  1. test_WORDS =从多个来源按顺序提取的单词。 OBJ_FK列是来源的ID。 WORD_ID是单词本身的标识符,在源中是唯一的。每行包含一个单词。
  2. test_PHRASE =要在test_WORDS中搜索的词组列表。 PHRASE_TEXT列是一个空格分隔的短语,如'foo bar'(见下文),因此每行包含多个单词。
  3. 要求: 返回test_WORDS中的第一个单词,即来自test_PHRASE的匹配短语的开头。

    我更喜欢基于以下的RBAR方法设置的东西。我的解决方案仅限于5个单词短语。我需要支持多达20个单词短语。是否可以将test_PHRASE中一行中的单词与test_WORD中没有游标的连续行匹配?

    将短语单词分解为临时表后,问题归结为按行顺序将两组的部分匹配在一起。

    -- Create test data
    CREATE TABLE [dbo].[test_WORDS](
        [OBJ_FK] [bigint] NOT NULL,             --FK to the source object
        [WORD_ID] [int] NOT NULL,               --The word order in the source object
        [WORD_TEXT] [nvarchar](50) NOT NULL,
         CONSTRAINT [PK_test_WORDS] PRIMARY KEY CLUSTERED 
        (
            [OBJ_FK] ASC,
            [WORD_ID] ASC
        )
    ) ON [PRIMARY]    
    GO
    
    CREATE TABLE [dbo].[test_PHRASE](
        [ID] [int],     --PHRASE ID
        [PHRASE_TEXT] [nvarchar](150) NOT NULL  --Space-separated phrase
         CONSTRAINT [PK_test_PHRASE] PRIMARY KEY CLUSTERED 
        (
            [ID] ASC
        )
    )
    GO
    INSERT INTO dbo.test_WORDS
    SELECT 1,1,'aaa' UNION ALL
    SELECT 1,2,'bbb' UNION ALL
    SELECT 1,3,'ccc' UNION ALL
    SELECT 1,4,'ddd' UNION ALL
    SELECT 1,5,'eee' UNION ALL
    SELECT 1,6,'fff' UNION ALL
    SELECT 1,7,'ggg' UNION ALL
    SELECT 1,8,'hhh' UNION ALL
    SELECT 2,1,'zzz' UNION ALL
    SELECT 2,2,'yyy' UNION ALL
    SELECT 2,3,'xxx' UNION ALL
    SELECT 2,4,'www'
    
    INSERT INTO dbo.test_PHRASE
    SELECT 1, 'bbb ccc ddd' UNION ALL --should match 
    SELECT 2, 'ddd eee fff' UNION ALL --should match 
    SELECT 3, 'xxx xxx xxx' UNION ALL --should NOT match 
    SELECT 4, 'zzz yyy xxx' UNION ALL --should match 
    SELECT 5, 'xxx www ppp' UNION ALL --should NOT match 
    SELECT 6, 'zzz yyy xxx www'    --should match 
    
    -- Create variables
    DECLARE @maxRow AS INTEGER
    DECLARE @currentRow AS INTEGER
    DECLARE @phraseSubsetTable AS TABLE(
        [ROW] int IDENTITY(1,1) NOT NULL,
        [ID] int NOT NULL,      --PHRASE ID
        [PHRASE_TEXT] nvarchar(150) NOT NULL
    )
    --used to split the phrase into words
    --note:  No permissions to sys.dm_fts_parser
    DECLARE @WordList table
    (
        ID int,
        WORD nvarchar(50)
    )
    --Records to be returned to caller
    DECLARE @returnTable AS TABLE(
        OBJECT_FK INT NOT NULL,
        WORD_ID INT NOT NULL,
        PHRASE_ID INT NOT NULL
    )
    DECLARE @phrase AS NVARCHAR(150)
    DECLARE @phraseID AS INTEGER
    
    -- Get subset of phrases to simulate a join that would occur in production
    INSERT INTO @phraseSubsetTable 
    SELECT ID, PHRASE_TEXT 
    FROM dbo.test_PHRASE
    --represent subset of phrases caused by join in production
    WHERE ID IN (2,3,4)
    
    -- Loop each phrase in the subset, split into rows of words and return matches to the test_WORDS table
    SET @maxRow = @@ROWCOUNT
    SET @currentRow = 1
    WHILE @currentRow <= @maxRow
    BEGIN
        SELECT @phrase=PHRASE_TEXT, @phraseID=ID FROM @phraseSubsetTable WHERE row = @currentRow
    
        --clear previous phrase that was split into rows
        DELETE FROM @WordList
    
        --Recursive Function with CTE to create recordset of words, one per row
        ;WITH Pieces(pn, start, stop) AS (
          SELECT 1, 1, CHARINDEX(' ', @phrase)
          UNION ALL
          SELECT pn + 1, stop + 1, CHARINDEX(' ', @phrase, stop + 1)
          FROM Pieces
          WHERE stop > 0)
        --Create the List of words with the CTE above
        insert into @WordList
        SELECT pn,
          SUBSTRING(@phrase, start, CASE WHEN stop > 0 THEN stop-start ELSE 1056 END) AS WORD
        FROM Pieces
    
        DECLARE @wordCt as int
        select @wordCt=count(ID) from @WordList;
    
        -- Do the actual query using a CTE with a rownumber that repeats for every SOURCE OBJECT
    ;WITH WordOrder_CTE AS (
    SELECT OBJ_FK, WORD_ID, WORD_TEXT,
        ROW_NUMBER() OVER (Partition BY OBJ_FK ORDER BY WORD_ID) AS rownum 
    FROM test_WORDS)
    --CREATE a flattened record of the first word in the phrase and join it to the rest of the words.
    INSERT INTO @returnTable  
    SELECT r1.OBJ_FK, r1.WORD_ID, @phraseID AS PHRASE_ID
    FROM WordOrder_CTE r1 
    INNER JOIN @WordList w1 ON r1.WORD_TEXT = w1.WORD and w1.ID=1
    LEFT JOIN WordOrder_CTE r2 
            ON r1.rownum = r2.rownum - 1 and r1.OBJ_FK = r2.OBJ_FK
                LEFT JOIN @WordList w2 ON r2.WORD_TEXT = w2.WORD and w2.ID=2
    LEFT JOIN WordOrder_CTE r3 
            ON r1.rownum = r3.rownum - 2 and r1.OBJ_FK = r3.OBJ_FK 
                LEFT JOIN @WordList w3 ON r3.WORD_TEXT = w3.WORD and w3.ID=3
    LEFT JOIN WordOrder_CTE r4
            ON r1.rownum = r4.rownum - 3 and r1.OBJ_FK = r4.OBJ_FK
                LEFT JOIN @WordList w4 ON r4.WORD_TEXT = w4.WORD and w4.ID=4
    LEFT JOIN WordOrder_CTE r5
            ON r1.rownum = r5.rownum - 4 and r1.OBJ_FK = r5.OBJ_FK
                LEFT JOIN @WordList w5 ON r5.WORD_TEXT = w5.WORD and w5.ID=5
    
    WHERE   (@wordCt < 2 OR w2.ID is not null) and
            (@wordCt < 3 OR w3.ID is not null) and
            (@wordCt < 4 OR w4.ID is not null) and
            (@wordCt < 5 OR w5.ID is not null)
    
        --loop
        SET @currentRow = @currentRow+1
    END 
    
    --Return the first words of each matching phrase
    SELECT  OBJECT_FK, WORD_ID, PHRASE_ID FROM @returnTable
    
    GO
    
    --Clean up
    DROP TABLE [dbo].[test_WORDS]
    DROP TABLE [dbo].[test_PHRASE]
    

    已编辑的解决方案:

    这是对下面提供的正确解决方案的编辑,以说明非连续的字ID。希望这能帮到像我这样的人。

    ;WITH
    numberedwords AS (
      SELECT
        OBJ_FK,
        WORD_ID,
        WORD_TEXT,
        rowcnt =  ROW_NUMBER() OVER
          (PARTITION BY OBJ_FK ORDER BY WORD_ID DESC),
        totalInSrc = COUNT(WORD_ID) OVER (PARTITION BY OBJ_FK)
      FROM dbo.test_WORDS
    ),
    phrasedwords AS (
      SELECT
        nw1.OBJ_FK,
        nw1.WORD_ID,
        nw1.WORD_TEXT,
        PHRASE_TEXT = RTRIM((
          SELECT [text()] = nw2.WORD_TEXT + ' '
          FROM numberedwords nw2
          WHERE nw1.OBJ_FK = nw2.OBJ_FK
             AND nw2.rowcnt BETWEEN nw1.rowcnt AND nw1.totalInSrc
          ORDER BY nw2.OBJ_FK, nw2.WORD_ID
          FOR XML PATH ('')
        ))
      FROM numberedwords nw1
      GROUP BY nw1.OBJ_FK, nw1.WORD_ID, nw1.WORD_TEXT, nw1.rowcnt, nw1.totalInSrc
    )
    SELECT *
    FROM phrasedwords pw
      INNER JOIN test_PHRASE tp
        ON LEFT(pw.PHRASE_TEXT, LEN(tp.PHRASE_TEXT)) = tp.PHRASE_TEXT
    ORDER BY pw.OBJ_FK, pw.WORD_ID
    

    注意:我在生产中使用的最终查询使用索引临时表而不是CTE。我还根据我的需要限制了PHRASE_TEXT列的长度。通过这些改进,我能够将查询时间从3分钟缩短到3秒!

3 个答案:

答案 0 :(得分:3)

这是一个使用不同方法的解决方案:它不是将短语分成单词,而是将单词组合成短语。

已修改:将rowcnt表达式更改为使用COUNT(*) OVER …,正如评论中@ErikE所建议的那样。

;WITH
numberedwords AS (
  SELECT
    OBJ_FK,
    WORD_ID,
    WORD_TEXT,
    rowcnt =  COUNT(*) OVER (PARTITION BY OBJ_FK)
  FROM dbo.test_WORDS
),
phrasedwords AS (
  SELECT
    nw1.OBJ_FK,
    nw1.WORD_ID,
    nw1.WORD_TEXT,
    PHRASE_TEXT = RTRIM((
      SELECT [text()] = nw2.WORD_TEXT + ' '
      FROM numberedwords nw2
      WHERE nw1.OBJ_FK = nw2.OBJ_FK
        AND nw2.WORD_ID BETWEEN nw1.WORD_ID AND nw1.rowcnt
      ORDER BY nw2.OBJ_FK, nw2.WORD_ID
      FOR XML PATH ('')
    ))
  FROM numberedwords nw1
  GROUP BY nw1.OBJ_FK, nw1.WORD_ID, nw1.WORD_TEXT, nw1.rowcnt
)
SELECT *
FROM phrasedwords pw
  INNER JOIN test_PHRASE tp
    ON LEFT(pw.PHRASE_TEXT, LEN(tp.PHRASE_TEXT)) = tp.PHRASE_TEXT
ORDER BY pw.OBJ_FK, pw.WORD_ID

答案 1 :(得分:0)

使用Split功能应该有效。

拆分功能

CREATE FUNCTION dbo.Split
(
    @RowData nvarchar(2000),
    @SplitOn nvarchar(5)
)  
RETURNS @RtnValue table 
(
    Id int identity(1,1),
    Data nvarchar(100)
) 
AS  
BEGIN 
    Declare @Cnt int
    Set @Cnt = 1

    While (Charindex(@SplitOn,@RowData)>0)
    Begin
        Insert Into @RtnValue (data)
        Select 
            Data = ltrim(rtrim(Substring(@RowData,1,Charindex(@SplitOn,@RowData)-1)))

        Set @RowData = Substring(@RowData,Charindex(@SplitOn,@RowData)+1,len(@RowData))
        Set @Cnt = @Cnt + 1
    End

    Insert Into @RtnValue (data)
    Select Data = ltrim(rtrim(@RowData))

    Return
END

SQL声明

SELECT  DISTINCT p.*
FROM    dbo.test_PHRASE p
        LEFT OUTER JOIN (
          SELECT  p.ID
          FROM    dbo.test_PHRASE p
                  CROSS APPLY dbo.Split(p.PHRASE_TEXT, ' ') sp
                  LEFT OUTER JOIN dbo.test_WORDS w ON w.WORD_TEXT = sp.Data 
          WHERE   w.OBJ_FK IS NULL
        ) ignore ON ignore.ID = p.ID
WHERE   ignore.ID IS NULL        

答案 2 :(得分:0)

这比其他解决方案的效果要好一些。如果您不需要WORD_ID,只需要WORD_TEXT,就可以删除整列。我知道这是一年多以前的事了,但我想知道你是否可以在3秒内缩短到30毫秒? :)

如果这个查询看起来不错,那么我最大的速度建议就是将整个短语放到一个单独的表中(使用你的示例数据,它只有2行,长度为8个单词和4个单词)。

SELECT
   W.OBJ_FK,
   X.Phrase,
   P.*,
   Left(P.PHRASE_TEXT, 
      IsNull(NullIf(CharIndex(' ', P.PHRASE_TEXT), 0) - 1, 2147483647)
   ) WORD_TEXT,
   Len(Left(X.Phrase, PatIndex('%' + P.PHRASE_TEXT + '%', ' ' + X.Phrase) - 1))
      - Len(Replace(
         Left(X.Phrase, PatIndex('%' + P.PHRASE_TEXT + '%', X.Phrase) - 1), ' ', '')
      )
      WORD_ID
FROM
   (SELECT DISTINCT OBJ_FK FROM dbo.test_WORDS) W
   CROSS APPLY (
      SELECT RTrim((SELECT WORD_TEXT + ' '
      FROM dbo.test_WORDS W2
      WHERE W.OBJ_FK = W2.OBJ_FK
      ORDER BY W2.WORD_ID
      FOR XML PATH (''))) Phrase
   ) X
   INNER JOIN dbo.test_PHRASE P
      ON X.Phrase LIKE '%' + P.PHRASE_TEXT + '%';

这是好奇心的另一个版本。它表现不佳。

WITH Calc AS (
   SELECT
      P.ID,
      P.PHRASE_TEXT,
      W.OBJ_FK,
      W.WORD_ID StartID,
      W.WORD_TEXT StartText,
      W.WORD_ID,
      Len(W.WORD_TEXT) + 2 NextPos,
      Convert(varchar(150), W.WORD_TEXT) MatchingPhrase
   FROM
      dbo.test_PHRASE P
      INNER JOIN dbo.test_WORDS W
         ON P.PHRASE_TEXT + ' ' LIKE W.WORD_TEXT + ' %'
   UNION ALL
   SELECT
      C.ID,
      C.PHRASE_TEXT,
      C.OBJ_FK,
      C.StartID,
      C.StartText,
      W.WORD_ID,
      C.NextPos + Len(W.WORD_TEXT) + 1,
      Convert(varchar(150), C.MatchingPhrase + Coalesce(' ' + W.WORD_TEXT, ''))
   FROM
      Calc C
      INNER JOIN dbo.test_WORDS W
         ON C.OBJ_FK = W.OBJ_FK
         AND C.WORD_ID + 1 = W.WORD_ID
         AND Substring(C.PHRASE_TEXT, C.NextPos, 2147483647) + ' ' LIKE W.WORD_TEXT + ' %'
)
SELECT C.OBJ_FK, C.PHRASE_TEXT, C.StartID, C.StartText, C.ID
FROM Calc C
WHERE C.PHRASE_TEXT = C.MatchingPhrase;