有没有办法在SQL Server中进行HTML解码?

时间:2015-01-31 15:11:55

标签: sql-server sql-server-2012

我的一张表中有以下记录

CD&M Communications 
auburndale oil & propane inc  
C F La Fountaine #7561  
Laramie County Fire District # 2  
AmeriGas Propane LP #2250  

有没有办法删除像"&",#7561,#2250等字符。

"&安培;"应替换为&根据C#HTMLDECODE函数

4 个答案:

答案 0 :(得分:10)

以下SQL函数适用于您的情况,或者它是您扩展它的一个很好的起点。但请注意,与应用程序层中的字符串操作相比,数据库[SQL Server]中的字符串操作会更慢。

GO

IF OBJECT_ID('dbo.MyHTMLDecode') IS NOT NULL BEGIN DROP FUNCTION dbo.MyHTMLDecode END

GO
CREATE FUNCTION dbo.MyHTMLDecode (@vcWhat VARCHAR(MAX))
RETURNS VARCHAR(MAX)
AS
BEGIN
    DECLARE @vcResult VARCHAR(MAX)
    DECLARE @siPos INT
        ,@vcEncoded VARCHAR(7)
        ,@siChar INT

    SET @vcResult = RTRIM(LTRIM(CAST(REPLACE(@vcWhat COLLATE Latin1_General_BIN, CHAR(0), '') AS VARCHAR(MAX))))

    SELECT @vcResult = REPLACE(REPLACE(@vcResult, ' ', ' '), ' ', ' ')

    IF @vcResult = ''
        RETURN @vcResult

    SELECT @siPos = PATINDEX('%&#[0-9][0-9][0-9];%', @vcResult)

    WHILE @siPos > 0
    BEGIN
        SELECT @vcEncoded = SUBSTRING(@vcResult, @siPos, 6)
            ,@siChar = CAST(SUBSTRING(@vcEncoded, 3, 3) AS INT)
            ,@vcResult = REPLACE(@vcResult, @vcEncoded, NCHAR(@siChar))
            ,@siPos = PATINDEX('%&#[0-9][0-9][0-9];%', @vcResult)
    END

    SELECT @siPos = PATINDEX('%&#[0-9][0-9][0-9][0-9];%', @vcResult)

    WHILE @siPos > 0
    BEGIN
        SELECT @vcEncoded = SUBSTRING(@vcResult, @siPos, 7)
            ,@siChar = CAST(SUBSTRING(@vcEncoded, 3, 4) AS INT)
            ,@vcResult = REPLACE(@vcResult, @vcEncoded, NCHAR(@siChar))
            ,@siPos = PATINDEX('%&#[0-9][0-9][0-9][0-9];%', @vcResult)
    END

    SELECT @siPos = PATINDEX('%#[0-9][0-9][0-9][0-9]%', @vcResult)

    WHILE @siPos > 0
    BEGIN
        SELECT @vcEncoded = SUBSTRING(@vcResult, @siPos, 5)
            ,@vcResult = REPLACE(@vcResult, @vcEncoded, '')
            ,@siPos = PATINDEX('%#[0-9][0-9][0-9][0-9]%', @vcResult)
    END

    SELECT @vcResult = REPLACE(REPLACE(@vcResult, NCHAR(160), ' '), CHAR(160), ' ')

    SELECT @vcResult = REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@vcResult, '&amp;', '&'), '&quot;', '"'), '&lt;', '<'), '&gt;', '>'), '&amp;amp;', '&')

    RETURN @vcResult
END

GO

插图:

  DECLARE @S VARCHAR(MAX)='CD&amp;amp;amp;M Communications 
    auburndale oil &amp;amp;amp; propane inc  
    C F La Fountaine #7561  
    Laramie County Fire District # 2  
    AmeriGas Propane LP #2250'

    SELECT dbo.MyHTMLDecode (@s)

输出:

CD&M Communications 
auburndale oil & propane inc  
C F La Fountaine   
Laramie County Fire District # 2  
AmeriGas Propane LP 

答案 1 :(得分:10)

有一个更简单的解决方案......

SQL Server支持XML数据类型,它支持解码XML / HTML编码实体。如果只是将字符串转换为XML数据类型,则可以使用内置的解码函数。

看起来像这样:

select cast('Q &amp; A' as XML).value('.[1]','nvarchar(max)' );

将其变成易于使用的功能:

create function dbo.xmlDecode (@string nvarchar(max))
returns varchar(max)
begin
    return cast(@string as XML).value('.[1]','nvarchar(max)' )
end;

请记住,在OP的示例中,字符串似乎连续编码了3次。 &已转为&amp;,然后转为&amp;amp;,然后转入&amp;amp;amp;。因此,要获得“原始”字符串,您必须使用解码功能3次。

答案 2 :(得分:6)

以前的版本不适用于日本,韩国,...... 这里修正了版本:

GO
IF OBJECT_ID('dbo.fn_HTMLDecode') IS NOT NULL BEGIN DROP FUNCTION dbo.fn_HTMLDecode END
GO
CREATE FUNCTION dbo.fn_HTMLDecode(
    @vcWhat NVARCHAR(MAX)
    ,@toDecodeMainISOSymbols bit = 1
    ,@toDecodeISOChars bit = 1
)
RETURNS NVARCHAR(MAX)
AS
BEGIN
    DECLARE @vcResult NVARCHAR(MAX);
    DECLARE @siPos INT ,@vcEncoded NVARCHAR(9) ,@siChar INT;
    SET @vcResult = RTRIM(LTRIM(CAST(REPLACE(@vcWhat COLLATE Latin1_General_BIN, CHAR(0), '') AS NVARCHAR(MAX))));
    SELECT @vcResult = REPLACE(REPLACE(@vcResult, '&#160;', ' '), '&nbsp;', ' ');
    IF @vcResult = '' RETURN @vcResult;

    declare @s varchar(35);
    declare @n int; set @n = 6;
    declare @i int;

    while @n > 2
    begin
        set @s = '';
        set @i=1;
        while @i<=@n
        begin
            set @s = @s + '[0-9]';
            set @i = @i + 1;
        end
        set @s = '%&#' + @s + '%';
        SELECT @siPos = PATINDEX(@s, @vcResult);
        WHILE @siPos > 0
        BEGIN
            SELECT @vcEncoded = SUBSTRING(@vcResult, @siPos, @n+3)
                ,@siChar = CAST(SUBSTRING(@vcEncoded, 3, @n) AS INT)
                ,@vcResult = REPLACE(@vcResult, @vcEncoded, NCHAR(@siChar))
                ,@siPos = PATINDEX(@s, @vcResult);
        END
        set @n = @n - 1;
    end

    if @toDecodeMainISOSymbols=1
    begin
        select @vcResult = REPLACE(REPLACE(@vcResult, NCHAR(160), ' '), CHAR(160), ' ');
        select @vcResult = REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@vcResult, '&amp;', '&'), '&quot;', '"'), '&lt;', '<'), '&gt;', '>'), '&amp;amp;', '&'),'&rdquo;','”'),'&bdquo;','„'),'&ndash;','–'),'&mdash;','—');
        select @vcResult = REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@vcResult,'&lsquo;','‘'),'&rsquo;','’'),'&bull;','•'),'&hellip;','…'),'&permil;','‰') COLLATE Latin1_General_BIN,'&prime;','′') COLLATE Latin1_General_BIN,'&Prime;','″'),'&circ;','ˆ'),'&tilde;','˜'),'&nbsp;',' ');
    end

    if @toDecodeISOChars=1
    begin
        select @vcResult = REPLACE(REPLACE(REPLACE(REPLACE(@vcResult COLLATE Latin1_General_BIN,'&Scaron;','Š') COLLATE Latin1_General_BIN,'&scaron;','š') COLLATE Latin1_General_BIN,'&Ccedil;','Ç') COLLATE Latin1_General_BIN,'&ccedil;','ç');
        select @vcResult = REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@vcResult,'&Agrave;','À') COLLATE Latin1_General_BIN,'&agrave;','à') COLLATE Latin1_General_BIN,'&Aacute;','Á') COLLATE Latin1_General_BIN,'&aacute;','á') COLLATE Latin1_General_BIN,'&Acirc;','Â') COLLATE Latin1_General_BIN,'&acirc;','â') COLLATE Latin1_General_BIN,'&Atilde;','Ã') COLLATE Latin1_General_BIN,'&atilde;','ã') COLLATE Latin1_General_BIN,'&Auml;','Ä') COLLATE Latin1_General_BIN,'&auml;','ä') COLLATE Latin1_General_BIN,'&Aring;','Å') COLLATE Latin1_General_BIN,'&aring;','å') COLLATE Latin1_General_BIN,'&AElig;','Æ') COLLATE Latin1_General_BIN,'&aelig;','æ');
        select @vcResult = REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@vcResult COLLATE Latin1_General_BIN,'&Egrave','È') COLLATE Latin1_General_BIN,'&egrave','è') COLLATE Latin1_General_BIN,'&Eacute;','É') COLLATE Latin1_General_BIN,'&eacute;','é') COLLATE Latin1_General_BIN,'&Ecirc;','Ê') COLLATE Latin1_General_BIN,'&ecirc;','ê') COLLATE Latin1_General_BIN,'&Euml;','Ë') COLLATE Latin1_General_BIN,'&euml;','ë');
        select @vcResult = REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@vcResult COLLATE Latin1_General_BIN,'&Igrave;','Ì') COLLATE Latin1_General_BIN,'&igrave;','ì') COLLATE Latin1_General_BIN,'&Iacute;','Í') COLLATE Latin1_General_BIN,'&iacute;','í') COLLATE Latin1_General_BIN,'&Icirc;','Î') COLLATE Latin1_General_BIN,'&icirc;','î') COLLATE Latin1_General_BIN,'&Iuml;','Ï') COLLATE Latin1_General_BIN,'&iuml;','ï');
        select @vcResult = REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@vcResult COLLATE Latin1_General_BIN,'&Ograve;','Ò') COLLATE Latin1_General_BIN,'&ograve;','ò') COLLATE Latin1_General_BIN,'&Oacute;','Ó') COLLATE Latin1_General_BIN,'&oacute;','ó') COLLATE Latin1_General_BIN,'&Ocirc;','Ô') COLLATE Latin1_General_BIN,'&ocirc;','ô') COLLATE Latin1_General_BIN,'&Otilde;','Õ') COLLATE Latin1_General_BIN,'&otilde;','õ') COLLATE Latin1_General_BIN,'&Ouml;','Ö') COLLATE Latin1_General_BIN,'&ouml;','ö') COLLATE Latin1_General_BIN,'&Oslash','Ø') COLLATE Latin1_General_BIN,'&oslash','ø');
        select @vcResult = REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@vcResult COLLATE Latin1_General_BIN,'&Ugrave;','Ù') COLLATE Latin1_General_BIN,'&ugrave;','ù') COLLATE Latin1_General_BIN,'&Uacute;','Ú') COLLATE Latin1_General_BIN,'&uacute;','ú') COLLATE Latin1_General_BIN,'&Ucirc;','Û') COLLATE Latin1_General_BIN,'&ucirc;','û') COLLATE Latin1_General_BIN,'&Uuml;','Ü') COLLATE Latin1_General_BIN,'&uuml;','ü');
        select @vcResult = REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@vcResult COLLATE Latin1_General_BIN,'&ETH;','Ð') COLLATE Latin1_General_BIN,'&eth;','ð') COLLATE Latin1_General_BIN,'&Ntilde;','Ñ') COLLATE Latin1_General_BIN,'&ntilde;','ñ') COLLATE Latin1_General_BIN,'&Yacute;','Ý') COLLATE Latin1_General_BIN,'&yacute;','ý') COLLATE Latin1_General_BIN,'&THORN;','Þ') COLLATE Latin1_General_BIN,'&thorn;','þ') COLLATE Latin1_General_BIN,'&szlig;','ß');
    end
    RETURN @vcResult;
END
-- test:
-- select dbo.fn_HTMLDecode(N'A fine example of man and nature co-existing is Slovenia&#8217;s ecological tourist farms.',1,1)
-- select dbo.fn_HTMLDecode(N'm0 &#50752;&#51064;&#48516;&#50556;&#50640;&#49436; m1 &#44032;&#51109; m2 &#50689;&#54693;&#47141; m3&#51080;&#45716;m10',1,1)

答案 3 :(得分:1)

使用@Wouter中的转换方法处理十进制十六进制实体,而接受的答案则不能。

但是,还需要一个处理程序来处理输入中的XML特殊字符。

XML特殊字符:<, >, ", ', &

create function [dbo].[XMLdecode] (@input nvarchar(max))
returns nvarchar(max)
begin

    declare @output nvarchar(max) = ''
    declare @next nchar(1)
    declare @endIdx int = 0
    declare @idx int = 0
    while @idx < len(@input)
    begin

        set @idx += 1
        set @next = substring(@input, @idx, 1)
        set @endIdx = charindex(';', @input, @idx) - @idx

        if @next = '&' and (@endIdx > 8 or @endIdx < 1) 
            set @output += '&amp;'
        else if @next = '&' and @endIdx > 1 and @endIdx < 8
        begin
            set @output += lower(substring(@input,@idx,@endIdx+1))
            set @idx += @endIdx
        end
        else
            set @output += @next
    end

    set @output = replace(@output,'<','&lt;')
    set @output = replace(@output,'>','&gt;')
    set @output = replace(@output,'"','&quot;')
    set @output = replace(@output,'''','&apos;')

    return cast(@output as XML).value('.[1]','nvarchar(max)')

end;

用法示例:

select dbo.XMLdecode('this is a t&#xc9;st &#xba;f HEX &amp; DECIMAL &#8364;ntities & <<<< non-entities too! &#9745;')
------------------------------------------------------------------------------
returns: 'this is a tÉst ºf HEX & DECIMAL €ntities & <<<< non-entities too! ☑'

它不是防弹的,但处理了我所有的案件。

看起来像HTML实体但实际上并非真正的东西仍然会引起麻烦。例如:&#x2;甚至&;

它也不会处理非XML命名实体,例如&check;&hearts;