SQL用户定义函数用于剥离HTML标记并替换HTML实体

时间:2015-07-10 22:56:47

标签: sql sql-server sql-server-2008

我正在尝试编写一个UDF(实际上我正在将我在网络上找到的一些代码改编成一个函数)来执行标题中描述的内容。

以下是代码:

declare @txt varchar(max), @start int, @end int, @len int

set @txt = '<p class=&#34;answer&#34;>Informamos que a documenta&ccedil;&atilde;o <strong>deve ser impressa e enviada fisicamente pela AG&Ecirc;NCIA</strong>, contendo confere com oringinal por funcion&aacute;rio CAIXA.</p>'
set @start = charindex('<',@txt)
set @end = charindex('>',@txt,@start)
set @len = (@end - @start) + 1

while @start > 0 and @end > 0 and @len > 0
begin
    set @txt = stuff(@txt,@start,@len,'')
    set @start = charindex('<',@txt)
    set @end = charindex('>',@txt,@start)
    set @len = (@end - @start) + 1
end

SET @txt = REPLACE(@txt,'&nbsp;',' ') --space
SET @txt = REPLACE(@txt,'&ldquo;',CHAR(34)) --"
SET @txt = REPLACE(@txt,'&rdquo;',CHAR(34)) --"
SET @txt = REPLACE(@txt,'&lsquo;',CHAR(39)) --'
SET @txt = REPLACE(@txt,'&rsquo;',CHAR(39)) --'
SET @txt = REPLACE(@txt,'&ndash;',CHAR(150)) -- –
SET @txt = REPLACE(@txt,'&mdash;',CHAR(151)) -- —
SET @txt = REPLACE(@txt,'&ordm;',CHAR(186)) -- º
SET @txt = REPLACE(@txt,'&ordf;',CHAR(170)) -- ª
SET @txt = REPLACE(@txt,'&sect;',CHAR(167)) -- §
--------------------------------------------------------------
SET @txt = REPLACE(@txt,'&#34;',CHAR(34)) --"
SET @txt = REPLACE(@txt,'&#39;',CHAR(39)) --'
--------------------------------------------------------------

SET @txt = REPLACE(@txt,'&agrave;','à') --à
SET @txt = REPLACE(@txt,'&aacute;','á') --á
SET @txt = REPLACE(@txt,'&atilde;','ã') --ã
SET @txt = REPLACE(@txt,'&acirc;','â') --â
SET @txt = REPLACE(@txt,'&auml;','ä') --ä
SET @txt = REPLACE(@txt,'&eacute;','é') --é
SET @txt = REPLACE(@txt,'&ecirc;','ê') --ê
SET @txt = REPLACE(@txt,'&iacute;','í') --í
SET @txt = REPLACE(@txt,'&oacute;','ó') --ó
SET @txt = REPLACE(@txt,'&otilde;','õ') --õ     
SET @txt = REPLACE(@txt,'&oslash;','ø') --ø
SET @txt = REPLACE(@txt,'&uacute;','ú') --ú
SET @txt = REPLACE(@txt,'&uuml;','ü') --ü
SET @txt = REPLACE(@txt,'&ccedil;','ç') --ç
--------------------------------------------------------------
SET @txt = REPLACE(@txt,'&Agrave;',CHAR(192)) --À
SET @txt = REPLACE(@txt,'&Aacute;',CHAR(193)) --Á
SET @txt = REPLACE(@txt,'&Atilde;',CHAR(195)) --Ã
SET @txt = REPLACE(@txt,'&Acirc;',CHAR(194)) --Â
SET @txt = REPLACE(@txt,'&Auml;',CHAR(196)) --Ä
SET @txt = REPLACE(@txt,'&Eacute;',CHAR(201)) --É
SET @txt = REPLACE(@txt,'&Ecirc;',CHAR(202)) --Ê
SET @txt = REPLACE(@txt,'&Iacute;',CHAR(205)) --Í
SET @txt = REPLACE(@txt,'&Oacute;',CHAR(211)) --Ó
SET @txt = REPLACE(@txt,'&Otilde;',CHAR(213)) --Õ
SET @txt = REPLACE(@txt,'&Oslash;',CHAR(216)) --Ø
SET @txt = REPLACE(@txt,'&Uacute;',CHAR(218)) --Ú
SET @txt = REPLACE(@txt,'&Uuml;',CHAR(220)) --Ü
SET @txt = REPLACE(@txt,'&Ccedil;',CHAR(199)) --Ç

select LTRIM(RTRIM(@txt))

它正在剥离HTML标记,只转换为小写HTML实体,在单词AG &Ecirc; NCIA(AGÊNCIA)中找到大写如&Ecirc;时不起作用,而不是打印AGêNCIA。

有任何帮助使其正常工作吗?

编辑:PS:我无法更改我的数据库排序规则,正如@dzomba所建议的

1 个答案:

答案 0 :(得分:2)

如果您需要,则需要将db collat​​ion设置为区分大小写。

UPDATE!这是一种解决方法,但我认为这将完成这项工作。我没有一个SQL服务器来测试它,但我几乎可以肯定它的工作正常。

declare @txt varchar(max)
declare @start int
declare @end int
declare @len int

set @txt = '<p class=&#34;answer&#34;>Informamos que a documenta&ccedil;&atilde;o <strong>deve ser impressa e enviada fisicamente pela AG&Ecirc;NCIA</strong>, contendo confere com oringinal por funcion&aacute;rio CAIXA.</p>'
set @start = charindex('<',@txt)
set @end = charindex('>',@txt,@start)
set @len = (@end - @start) + 1

while @start > 0 and @end > 0 and @len > 0
begin
    set @txt = stuff(@txt,@start,@len,'')
    set @start = charindex('<',@txt)
    set @end = charindex('>',@txt,@start)
    set @len = (@end - @start) + 1
end

DECLARE @table (txtColumn varchar(max) COLLATE SQL_Latin1_General_CP1_CS_AS ) --make the column case sensitive


INSERT INTO @table (txtColumn)
SELECT @txt

UPDATE @table set txtColumn = REPLACE(txtColumn,'&nbsp;',' ') --space
UPDATE @table set txtColumn = REPLACE(txtColumn,'&ldquo;',CHAR(34)) --"
UPDATE @table set txtColumn = REPLACE(txtColumn,'&rdquo;',CHAR(34)) --"
UPDATE @table set txtColumn = REPLACE(txtColumn,'&lsquo;',CHAR(39)) --'
UPDATE @table set txtColumn = REPLACE(txtColumn,'&rsquo;',CHAR(39)) --'
UPDATE @table set txtColumn = REPLACE(txtColumn,'&ndash;',CHAR(150)) -- –
UPDATE @table set txtColumn = REPLACE(txtColumn,'&mdash;',CHAR(151)) -- —
UPDATE @table set txtColumn = REPLACE(txtColumn,'&ordm;',CHAR(186)) -- º
UPDATE @table set txtColumn = REPLACE(txtColumn,'&ordf;',CHAR(170)) -- ª
UPDATE @table set txtColumn = REPLACE(txtColumn,'&sect;',CHAR(167)) -- §
    -------------------------------------------------------------
UPDATE @table set txtColumn = REPLACE(txtColumn,'&#34;',CHAR(34)) --"
UPDATE @table set txtColumn = REPLACE(txtColumn,'&#39;',CHAR(39)) --'
    --------------------------------------------------------------

UPDATE @table set txtColumn = REPLACE(txtColumn,'&agrave;','à') --à
UPDATE @table set txtColumn = REPLACE(txtColumn,'&aacute;','á') --á
UPDATE @table set txtColumn = REPLACE(txtColumn,'&atilde;','ã') --ã
UPDATE @table set txtColumn = REPLACE(txtColumn,'&acirc;','â') --â
UPDATE @table set txtColumn = REPLACE(txtColumn,'&auml;','ä') --ä
UPDATE @table set txtColumn = REPLACE(txtColumn,'&eacute;','é') --é
UPDATE @table set txtColumn = REPLACE(txtColumn,'&ecirc;','ê') --ê
UPDATE @table set txtColumn = REPLACE(txtColumn,'&iacute;','í') --í
UPDATE @table set txtColumn = REPLACE(txtColumn,'&oacute;','ó') --ó
UPDATE @table set txtColumn = REPLACE(txtColumn,'&otilde;','õ') --õ     
UPDATE @table set txtColumn = REPLACE(txtColumn,'&oslash;','ø') --ø
UPDATE @table set txtColumn = REPLACE(txtColumn,'&uacute;','ú') --ú
UPDATE @table set txtColumn = REPLACE(txtColumn,'&uuml;','ü') --ü
UPDATE @table set txtColumn = REPLACE(txtColumn,'&ccedil;','ç') --ç
    -------------------------------------------------------------
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Agrave;',CHAR(192)) --À
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Aacute;',CHAR(193)) --Á
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Atilde;',CHAR(195)) --Ã
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Acirc;',CHAR(194)) --Â
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Auml;',CHAR(196)) --Ä
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Eacute;',CHAR(201)) --É
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Ecirc;',CHAR(202)) --Ê
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Iacute;',CHAR(205)) --Í
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Oacute;',CHAR(211)) --Ó
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Otilde;',CHAR(213)) --Õ
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Oslash;',CHAR(216)) --Ø
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Uacute;',CHAR(218)) --Ú
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Uuml;',CHAR(220)) --Ü
UPDATE @table set txtColumn = REPLACE(txtColumn,'&Ccedil;',CHAR(199)) --Ç

    SELECT LTRIM(RTRIM(txtColumn)) FROM @table

--- IN THE END DROP THE TABLE
    DROP TABLE @table