从含有未闭合标签的CDATA中提取XML节点

时间:2017-03-09 21:03:38

标签: sql-server xml-parsing cdata

我的XML中包含未闭合的HTML标签,所以我把这部分内容放进了CDATA,这样解析时就不会报错。现在该如何提取其中的各个XML节点?

-- Sample table: ID plus the raw document text stored as varchar(max).
CREATE TABLE dbo.temp (ID int, input varchar(max));

-- The VALUES keyword is required: a parenthesised list placed directly after
-- the table name is parsed as the column list, not as the row to insert.
-- The payload wraps its inner markup in CDATA; note the stray </p> in HeadLine.
INSERT INTO dbo.temp (ID, input)
VALUES (1,'<?xml version="1.0" encoding="iso-8859-1"?>  <!DOCTYPE title [ <!ELEMENT title ANY > <!ENTITY xxe SYSTEM "https://grepular.com/xxe.txt" >]>
    <customer>
    <![CDATA[<TransmissionId>5555</TransmissionId>
    <HeadLine>Hair Loss &amp; Growth Treatments and Products Sales Market Research Report 2016-2021</p></HeadLine>
    ]]></customer>');

可以看到,HeadLine节点中有一个</p>,却没有与之匹配的<p>。如何从这段CDATA中提取节点?

谢谢,MR

1 个答案:

答案 0 :(得分:2)

你可以使用LEFT、RIGHT和CHARINDEX这样的字符串函数,但我觉得那样很繁琐。如果你不介意使用表值函数(TVF),下面这个函数可以根据给定的起止分隔符提取值。需要补充的是,它并不局限于XML标签。

由于它是表值函数,因此可以返回一个或多个值。

示例

-- Sample data: one row holding the document text as varchar(max).
Declare @YourTable table (ID int, input varchar(max));

Insert Into @YourTable (ID, input)
Values
(1,'<?xml version="1.0" encoding="iso-8859-1"?>  <!DOCTYPE title [ <!ELEMENT title ANY > <!ENTITY xxe SYSTEM "https://grepular.com/xxe.txt" >]>
    <customer>
    <![CDATA[<TransmissionId>5555</TransmissionId>
    <HeadLine>Hair Loss &amp; Growth Treatments and Products Sales Market Research Report 2016-2021</p></HeadLine>
    ]]></customer>');

-- Apply the extractor to every row: each <HeadLine>...</HeadLine> match becomes
-- one output row. The column is referenced with the same casing it was declared
-- with (input) so the query also binds under a case-sensitive collation.
Select Src.ID
      ,Hit.RetVal
 From  @YourTable As Src
 Cross Apply [dbo].[udf-Str-Extract](Src.input,'<HeadLine>','</HeadLine>') As Hit;

返回

ID  RetVal
1   Hair Loss &amp; Growth Treatments and Products Sales Market Research Report 2016-2021</p>

如果感兴趣,UDF如下:

/*
[dbo].[udf-Str-Extract]

Inline table-valued function that returns the text found between an opening
delimiter and the next closing delimiter, once per opening delimiter found.

Parameters
  @String      text to search
  @Delimiter1  opening delimiter
  @Delimiter2  closing delimiter

Result set (one row per extracted value)
  RetSeq  1-based ordinal of the value, ordered by position
  RetPos  position in @String of the first character after the opening delimiter
          (before trimming, so RetVal may start later if blanks follow the delimiter)
  RetLen  length of RetVal
  RetVal  text between the delimiters; the segment is ltrim/rtrim-ed before the
          closing delimiter is located

Notes
  - The tally CTE produces at most 10^6 numbers, so only the first 1MM
    characters of @String are scanned for opening delimiters.
  - Segments without a closing delimiter, and segments whose trimmed text
    starts with the closing delimiter (empty value), produce no row.

Example
  Declare @String varchar(max) = 'Dear [[FirstName]] [[LastName]], ...'
  Select * From [dbo].[udf-Str-Extract] (@String,'[[',']]')
*/
CREATE FUNCTION [dbo].[udf-Str-Extract] (@String varchar(max),@Delimiter1 varchar(100),@Delimiter2 varchar(100))
Returns Table
As
Return (

with   -- ten rows, used as the base of the tally
       cte1(N)   As (Select 1 From (Values(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) N(N)),
       -- tally 1..DataLength(@String), capped at 10^6 by the six-way cross join
       cte2(N)   As (Select Top (IsNull(DataLength(@String),0)) Row_Number() over (Order By (Select NULL))
                       From cte1 N1
                      Cross Join cte1 N2
                      Cross Join cte1 N3
                      Cross Join cte1 N4
                      Cross Join cte1 N5
                      Cross Join cte1 N6),
       -- segment starts: position 1 plus the position right after every opening delimiter
       -- NOTE(review): the seed row at position 1 means text preceding the first opening
       -- delimiter is returned too if it contains a closing delimiter - confirm this is intended.
       cte3(N)   As (Select 1 Union All Select t.N+DataLength(@Delimiter1) From cte2 t Where Substring(@String,t.N,DataLength(@Delimiter1)) = @Delimiter1),
       -- segment length: up to the next opening delimiter; the last segment runs to the end
       -- of @String (a fixed 8000 here would miss a closing delimiter further away than that)
       cte4(N,L) As (Select S.N,IsNull(NullIf(CharIndex(@Delimiter1,@String,S.N),0)-S.N,DataLength(@String)) From cte3 S)

Select RetSeq = Row_Number() over (Order By N)
      ,RetPos = N
      ,RetLen = charindex(@Delimiter2,RetVal)-1
      ,RetVal = left(RetVal,charindex(@Delimiter2,RetVal)-1)
 From (Select A.N,RetVal = ltrim(rtrim(Substring(@String, A.N, A.L))) From cte4 A ) A
 Where charindex(@Delimiter2,RetVal)>1
)