我需要从字符串中提取唯一的单词和数值。在这一点上,我有一个功能,剥离一切,只返回字母数字。我还需要识别一个单词确实是一个日期或数字,并防止文本被拆分。我怎么能这样做?
这是我目前拥有的分割器功能:
Public Function GetAlphaNumericWords(ByVal InputText As String) As Collection
' This function splits the rich text input into unique alpha-numeric only strings
Dim words() As String
Dim characters() As Byte
Dim text As Variant
Dim i As Long
Set GetAlphaNumericWords = New Collection
text = Trim(PlainText(InputText))
If Len(text) > 0 Then
' Replace any non alphanumeric characters with a space
characters = StrConv(text, vbFromUnicode)
For i = LBound(characters) To UBound(characters)
If Not (Chr(characters(i)) Like "[A-Za-z0-9 ]") Then
characters(i) = 32 ' Space character
End If
Next
' Merge the byte array back to a string and then split on spaces
words = VBA.Split(StrConv(characters, vbUnicode))
' Add each unique word to the output collection
On Error Resume Next
For Each text In words
If (text <> vbNullString) Then GetAlphaNumericWords.Add CStr(text), CStr(text)
If Err Then Err.Clear
Next
End If
End Function
此函数当前返回的输出示例:
GetAlphaNumericWords("Hello World! Test 1. 123.45 8/22/2013 August 22, 2013")
Hello
World
Test
1
123
45
8
22
2013
August
我真正想要的是:
Hello
World
Test
1
123.45
8/22/2013
答案 0 :(得分:3)
当你可以使用正则表达式时,这似乎很多工作。请参阅here和here以获得良好的起点。
如果您添加对“Microsoft VBScript正则表达式5.5”的引用并添加以下函数(我已经包含了一些不必要的函数,以防它们在其他地方有用):
Public Function RegEx(strInput As String, strRegEx As String, Optional bIgnoreCase As Boolean = True, Optional bMultiLine As Boolean = False) As Boolean
Dim RegExp As VBScript_RegExp_55.RegExp
Set RegExp = New VBScript_RegExp_55.RegExp
With RegExp
.MultiLine = bMultiLine
.IgnoreCase = bIgnoreCase
.Pattern = strRegEx
End With
RegEx = RegExp.test(strInput)
Set RegExp = Nothing
End Function
Public Function RegExMatch(strInput As String, strRegEx As String, Optional MatchNo As Long = 0, Optional FirstIDX As Long, Optional Lgth As Long, Optional bIgnoreCase As Boolean = True, Optional bMultiLine As Boolean = False) As String
Dim RegExp As VBScript_RegExp_55.RegExp, Matches As VBScript_RegExp_55.MatchCollection
Set RegExp = New VBScript_RegExp_55.RegExp
With RegExp
.Global = True
.MultiLine = bMultiLine
.IgnoreCase = bIgnoreCase
.Pattern = strRegEx
End With
If RegExp.test(strInput) Then
Set Matches = RegExp.Execute(strInput)
If MatchNo > Matches.Count - 1 Then
RegExMatch = ""
Else
RegExMatch = Matches(MatchNo).value
FirstIDX = Matches(MatchNo).FirstIndex
Lgth = Matches(MatchNo).Length
End If
Else
RegExMatch = ""
End If
Set RegExp = Nothing
End Function
Public Function RegexMatches(strInput As String, strRegEx As String, Optional bIgnoreCase As Boolean = True, Optional bMultiLine As Boolean = False) As VBScript_RegExp_55.MatchCollection
Dim RegExp As VBScript_RegExp_55.RegExp
Set RegExp = New VBScript_RegExp_55.RegExp
With RegExp
.Global = True
.MultiLine = bMultiLine
.IgnoreCase = bIgnoreCase
.Pattern = strRegEx
End With
Set RegexMatches = RegExp.Execute(strInput)
Set RegExp = Nothing
End Function
Public Function RegExReplace(strInput As String, strRegEx As String, strReplace As String, Optional bGlobal As Boolean = True, Optional bIgnoreCase As Boolean = True, Optional bMultiLine As Boolean = False) As String
Dim RegExp As VBScript_RegExp_55.RegExp
Set RegExp = New VBScript_RegExp_55.RegExp
With RegExp
.MultiLine = bMultiLine
.IgnoreCase = bIgnoreCase
.Pattern = strRegEx
.Global = bGlobal
End With
RegExReplace = RegExp.Replace(strInput, strReplace)
Set RegExp = Nothing
End Function
您应该能够使用它们来制作更实用,更优雅的解决方案。
您应该考虑类似于以下的正则表达式模式:
\b(\w+)\b
和类似以下的代码 - 对于每场比赛&amp;使用RegexMatches
的子匹配,在其上尝试CDec
和CDate
,如果不得到错误,则拒绝它(没有错误会表明合法的日期或号码):
Dim Matches As VBScript_RegExp_55.MatchCollection
...
Set Matches = RegexMatches(InputText , "\b(\w+)\b")
If Matches.Count > 0 Then
For CtrA = 0 To Matches.Count - 1
For CtrB = 0 To Matches(CtrA).SubMatches.Count - 1
On Error Resume Next
TestVariant = Null
TestVariant = CDec(Matches(CtrA).Submatches(CtrB))
TestVariant = CDate(Matches(CtrA).Submatches(CtrB))
On Error Goto 0
If IsNull(TestVariant) Then
' Do further processing to check if the submatch can be split on non-alphanumeric characters...
Else
GetAlphaNumericWords.Add Matches(CtrA).Submatches(CtrB), Matches(CtrA).Submatches(CtrB)
End If
Next
Next
End If