我想从数据集中过滤掉几个关键短语。不幸的是,到目前为止我唯一能够提出的算法是嵌套替换语句,例如:
-- Strip the <u> and </u> markers from FIELDNAME by nesting one REPLACE per phrase.
SELECT
    REPLACE(
        REPLACE(FIELDNAME, '</u>', ''),  -- inner call: drop the closing marker
        '<u>', ''                        -- outer call: drop the opening marker
    )
其中FIELDNAME是存储在表中的原始HTML代码。如你所见,这很可怕。有更好的想法吗?
答案 0 :(得分:3)
我认为在TSQL中没有更好的方法。
如果在SQL层之上还有另一层运行环境(例如asp.net),在那一层进行过滤可能会更顺利。
答案 1 :(得分:1)
此类字符串操作最好由CLR scalar valued functions处理。
答案 2 :(得分:1)
这里(原文为指向外部讨论的链接)讨论了一个类似但更复杂的问题:根据一组规则“清理”HTML文本(您需要在该站点注册,但仅此而已)。讨论中包括几种使用T-SQL的方法,以及使用SQLCLR实现该目标的最快方法。由于其中的VB.Net/SQLCLR解决方案是我编写的,我把它附在下面。
以下是它实施的文本替换/转换:
它被实现为一种文本转换器(transducer)类型的DFSA(确定性有限状态自动机;嗯,只能说它“几乎”是确定性的,因为它在几个地方会向前查看,即look-ahead):
Imports System
Imports System.Data
Imports System.Data.SqlClient
Imports System.Data.SqlTypes
Imports Microsoft.SqlServer.Server
Partial Public Class UserDefinedFunctions

    ' Top-level states of the HTMLCleaner scanner.
    ' Member order is kept as-is so the underlying numeric values do not change.
    Public Enum States
        Space1      ' at the start of the output or right after an emitted/discarded space
        Entity      ' inside "&...;" - collecting the entity name
        HTMLTag     ' inside "<...>" - collecting the tag text
        Norm        ' ordinary text
        Word1       ' exactly one letter of a new word has been emitted
        Script      ' inside a <script> element, skipping until its closing tag
        Style       ' inside a <style> element, skipping until its closing tag
    End Enum

    ' Progress towards recognising a closing tag while in the Script/Style states.
    Enum SubStates
        None        ' no "<" seen yet
        EndBegin    ' "<" seen, waiting for "/"
        EndSlash    ' "</" seen, collecting the tag name up to ">"
    End Enum

    ' Byte values of the characters the scanner cares about.
    Const CharSpace As Integer = 32
    Const CharAmp As Integer = 38
    Const CharSlash As Integer = 47
    Const CharLT As Integer = 60
    Const CharGT As Integer = 62
    Const CharA As Integer = 65
    Const CharX As Integer = 88
    Const CharZ As Integer = 90
    Const Char_a As Integer = 97
    Const Char_b As Integer = 98
    Const Char_n As Integer = 110
    Const Char_p As Integer = 112
    Const Char_s As Integer = 115
    Const Char_z As Integer = 122
    Const CharDash As Integer = 45
    Const CharSemiC As Integer = 59

    ' Returns the upper-cased leading run of letters/digits of the text collected
    ' between "<" (or "</") and ">", i.e. the element name without attributes.
    ' Returns "" when the text does not start with a letter or digit (e.g. "/b").
    Private Shared Function HTMLCleanerTagName(ByVal raw As String) As String
        Dim n As Integer = 0
        While n < raw.Length AndAlso Char.IsLetterOrDigit(raw.Chars(n))
            n += 1
        End While
        Return raw.Substring(0, n).ToUpperInvariant()
    End Function

    ' Reduces single-byte HTML text to plain words.
    '
    ' Rules applied while scanning the input one byte at a time:
    '   * Only A-Z, a-z, "-" and single spaces are copied to the output; every
    '     other byte outside a tag/entity is dropped.
    '   * Leading, trailing and repeated spaces are removed.
    '   * A one-letter word that is followed by a space is removed.
    '   * "&nbsp;" becomes a space; any other "&...;" entity becomes "X".
    '   * "<...>" tags are replaced by a space.
    '   * Everything between <script>...</script> and <style>...</style> is
    '     dropped (tag names are matched case-insensitively, attributes ignored).
    '
    ' chars:   the text as bytes, one byte per character.
    ' Returns: the cleaned bytes; SqlBytes.Null when chars is Nothing or SQL NULL.
    '
    ' Known limitation (unchanged): a "&" that is never followed by ";" swallows
    ' the rest of the input, because the Entity state only ends on ";".
    <Microsoft.SqlServer.Server.SqlFunction( _
        DataAccess:=DataAccessKind.None _
        , IsDeterministic:=True _
        , IsPrecise:=True)> _
    Public Shared Function HTMLCleaner(ByVal chars As SqlTypes.SqlBytes) As SqlTypes.SqlBytes
        ' chars.Length throws on a NULL value, so answer NULL with NULL up front.
        If chars Is Nothing OrElse chars.IsNull Then
            Return SqlTypes.SqlBytes.Null
        End If

        Dim total As Integer = CInt(chars.Length)
        ' Every input byte produces at most one output byte, so the output never
        ' needs more room than the input. (Upper bound -1 yields an empty array.)
        Dim result(total - 1) As Byte
        Dim j As Integer = 0                        ' number of bytes written to result
        Dim mode As States = States.Space1
        Dim phase As SubStates = SubStates.None
        ' Text of the entity / tag currently being collected.
        Dim accum As New System.Text.StringBuilder()

        For i As Integer = 0 To total - 1
            Dim b As Byte = chars(i)

            Select Case mode
                Case States.Norm, States.Space1, States.Word1
                    ' The three plain-text states differ only in how they treat
                    ' a space and which state a letter leads to.
                    Select Case b
                        Case CharAmp
                            accum.Length = 0        ' start a fresh entity name
                            mode = States.Entity    ' skip output
                        Case CharLT
                            result(j) = CharSpace
                            j += 1
                            accum.Length = 0        ' start a fresh tag name
                            mode = States.HTMLTag
                        Case CharDash
                            result(j) = b
                            j += 1
                            mode = States.Norm
                        Case CharSpace
                            If mode = States.Word1 Then
                                ' single char word, retract it from the output
                                j -= 1
                            ElseIf mode = States.Norm Then
                                result(j) = b
                                j += 1
                            End If
                            ' (in Space1 the space is simply discarded)
                            mode = States.Space1
                        Case CharA To CharZ, Char_a To Char_z
                            result(j) = b
                            j += 1
                            If mode = States.Space1 Then
                                mode = States.Word1
                            Else
                                mode = States.Norm
                            End If
                        Case Else
                            mode = States.Norm      ' skip output
                    End Select

                Case States.Entity
                    If b = CharSemiC Then
                        ' End of entity, wrap it up.
                        If accum.ToString() = "nbsp" Then
                            result(j) = CharSpace
                            j += 1
                            mode = States.Space1
                        Else
                            ' any other entity is represented by "X"
                            result(j) = CharX
                            j += 1
                            mode = States.Norm
                        End If
                        accum.Length = 0
                    Else
                        ' Keep scanning for the semicolon. ChrW turns the byte into
                        ' its character; "&" on a Byte would append its decimal text.
                        accum.Append(ChrW(b))
                    End If

                Case States.HTMLTag
                    If b = CharGT Then
                        Dim tagName As String = HTMLCleanerTagName(accum.ToString())
                        accum.Length = 0
                        If tagName = "SCRIPT" Then
                            mode = States.Script
                            phase = SubStates.None
                        ElseIf tagName = "STYLE" Then
                            mode = States.Style
                            phase = SubStates.None
                        Else
                            result(j) = CharSpace
                            j += 1
                            mode = States.Space1
                        End If
                    Else
                        ' accumulate the tag text
                        accum.Append(ChrW(b))
                    End If

                Case States.Script, States.Style
                    ' Skip everything until the matching closing tag.
                    Dim closingName As String = "SCRIPT"
                    If mode = States.Style Then closingName = "STYLE"

                    Select Case phase
                        Case SubStates.None
                            ' A closing tag starts with "<".
                            If b = CharLT Then phase = SubStates.EndBegin
                        Case SubStates.EndBegin
                            If b = CharSlash Then
                                phase = SubStates.EndSlash
                                accum.Length = 0
                            ElseIf b <> CharLT Then
                                ' not a closing tag; a repeated "<" keeps waiting for "/"
                                phase = SubStates.None
                            End If
                        Case SubStates.EndSlash
                            If b = CharGT Then
                                If HTMLCleanerTagName(accum.ToString()) = closingName Then
                                    ' end of the element found; output nothing
                                    mode = States.Norm
                                End If
                                ' otherwise: false alarm, keep skipping
                                phase = SubStates.None
                                accum.Length = 0
                            ElseIf b = CharLT Then
                                ' "</" was not a tag after all; this "<" may start the real one
                                phase = SubStates.EndBegin
                                accum.Length = 0
                            Else
                                ' accumulate the end tag's name
                                accum.Append(ChrW(b))
                            End If
                    End Select

                Case Else
                    System.Diagnostics.Debug.Assert(False, "HTMLCleaner: unknown scanner state")
            End Select

            ' Extra check for multiple spaces: several branches above emit a
            ' space unconditionally, so collapse doubles and drop a leading one.
            If j > 1 _
                AndAlso result(j - 1) = CharSpace _
                AndAlso result(j - 2) = CharSpace Then
                j -= 1      ' roll back the last character
            ElseIf j = 1 AndAlso result(0) = CharSpace Then
                j = 0       ' overwrite leading space
            End If
        Next

        ' remove any trailing space
        If j > 0 AndAlso result(j - 1) = CharSpace Then j -= 1

        ' trim off the unused tail (j = 0 gives an empty array)
        ReDim Preserve result(j - 1)
        Return New SqlTypes.SqlBytes(result)
    End Function

    ' Returns a copy of the input value.
    '
    ' chars:   the bytes to copy.
    ' Returns: a new SqlBytes holding exactly chars.Length bytes; SqlBytes.Null
    '          when chars is Nothing or SQL NULL.
    <Microsoft.SqlServer.Server.SqlFunction( _
        DataAccess:=DataAccessKind.None _
        , IsDeterministic:=True _
        , IsPrecise:=True)> _
    Public Shared Function HTMLCopy2(ByVal chars As SqlTypes.SqlBytes) As SqlTypes.SqlBytes
        If chars Is Nothing OrElse chars.IsNull Then
            Return SqlTypes.SqlBytes.Null
        End If
        ' Value yields a managed copy of exactly Length bytes, whereas Buffer is
        ' the internal buffer and may be longer than the value it holds.
        Return New SqlTypes.SqlBytes(chars.Value)
    End Function

    ' Returns the input value unchanged (as a copy).
    '
    ' chars:   the bytes to return.
    ' Returns: a new SqlBytes holding exactly chars.Length bytes; SqlBytes.Null
    '          when chars is Nothing or SQL NULL.
    <Microsoft.SqlServer.Server.SqlFunction( _
        DataAccess:=DataAccessKind.None _
        , IsDeterministic:=True _
        , IsPrecise:=True)> _
    Public Shared Function HTMLCopy(ByVal chars As SqlTypes.SqlBytes) As SqlTypes.SqlBytes
        If chars Is Nothing OrElse chars.IsNull Then
            Return SqlTypes.SqlBytes.Null
        End If
        ' Wrapping chars.Buffer directly could expose bytes beyond chars.Length.
        Return New SqlTypes.SqlBytes(chars.Value)
    End Function
End Class