解析与HTML,单词,数字和日期混合的字符串

时间:2013-08-22 20:41:53

标签: vba

我需要从字符串中提取唯一的单词和数值。在这一点上,我有一个功能,剥离一切,只返回字母数字。我还需要识别一个单词确实是一个日期或数字,并防止文本被拆分。我怎么能这样做?

这是我目前拥有的分割器功能:

Public Function GetAlphaNumericWords(ByVal InputText As String) As Collection
' This function splits the rich text input into unique alpha-numeric only strings
    Dim words() As String
    Dim characters() As Byte
    Dim text As Variant
    Dim i As Long

    Set GetAlphaNumericWords = New Collection

    text = Trim(PlainText(InputText))
    If Len(text) > 0 Then
    ' Replace any non alphanumeric characters with a space
        characters = StrConv(text, vbFromUnicode)
        For i = LBound(characters) To UBound(characters)
            If Not (Chr(characters(i)) Like "[A-Za-z0-9 ]") Then
                characters(i) = 32 ' Space character
            End If
        Next
        ' Merge the byte array back to a string and then split on spaces
        words = VBA.Split(StrConv(characters, vbUnicode))

        ' Add each unique word to the output collection
        On Error Resume Next
        For Each text In words
            If (text <> vbNullString) Then GetAlphaNumericWords.Add CStr(text), CStr(text)
            If Err Then Err.Clear
        Next
    End If
End Function

此函数当前返回的输出示例:

GetAlphaNumericWords("Hello World!  Test 1. 123.45 8/22/2013 August 22, 2013")

Hello
World
Test
1
123
45
8
22
2013
August

我真正想要的是:

Hello
World
Test
1
123.45
8/22/2013

1 个答案:

答案 0 :(得分:3)

当你可以使用正则表达式时,这似乎很多工作。请参阅herehere以获得良好的起点。

如果您添加对“Microsoft VBScript正则表达式5.5”的引用并添加以下函数(我已经包含了一些不必要的函数,以防它们在其他地方有用):

Public Function RegEx(strInput As String, strRegEx As String, Optional bIgnoreCase As Boolean = True, Optional bMultiLine As Boolean = False) As Boolean
    Dim RegExp As VBScript_RegExp_55.RegExp
    Set RegExp = New VBScript_RegExp_55.RegExp
    With RegExp
        .MultiLine = bMultiLine
        .IgnoreCase = bIgnoreCase
        .Pattern = strRegEx
    End With
    RegEx = RegExp.test(strInput)
    Set RegExp = Nothing
End Function

Public Function RegExMatch(strInput As String, strRegEx As String, Optional MatchNo As Long = 0, Optional FirstIDX As Long, Optional Lgth As Long, Optional bIgnoreCase As Boolean = True, Optional bMultiLine As Boolean = False) As String
    Dim RegExp As VBScript_RegExp_55.RegExp, Matches As VBScript_RegExp_55.MatchCollection
    Set RegExp = New VBScript_RegExp_55.RegExp
    With RegExp
        .Global = True
        .MultiLine = bMultiLine
        .IgnoreCase = bIgnoreCase
        .Pattern = strRegEx
    End With
    If RegExp.test(strInput) Then
        Set Matches = RegExp.Execute(strInput)
        If MatchNo > Matches.Count - 1 Then
            RegExMatch = ""
        Else
            RegExMatch = Matches(MatchNo).value
            FirstIDX = Matches(MatchNo).FirstIndex
            Lgth = Matches(MatchNo).Length
        End If
    Else
        RegExMatch = ""
    End If
    Set RegExp = Nothing
End Function

Public Function RegexMatches(strInput As String, strRegEx As String, Optional bIgnoreCase As Boolean = True, Optional bMultiLine As Boolean = False) As VBScript_RegExp_55.MatchCollection
    Dim RegExp As VBScript_RegExp_55.RegExp
    Set RegExp = New VBScript_RegExp_55.RegExp
    With RegExp
        .Global = True
        .MultiLine = bMultiLine
        .IgnoreCase = bIgnoreCase
        .Pattern = strRegEx
    End With
    Set RegexMatches = RegExp.Execute(strInput)
    Set RegExp = Nothing
End Function

Public Function RegExReplace(strInput As String, strRegEx As String, strReplace As String, Optional bGlobal As Boolean = True, Optional bIgnoreCase As Boolean = True, Optional bMultiLine As Boolean = False) As String
    Dim RegExp As VBScript_RegExp_55.RegExp
    Set RegExp = New VBScript_RegExp_55.RegExp
    With RegExp
        .MultiLine = bMultiLine
        .IgnoreCase = bIgnoreCase
        .Pattern = strRegEx
        .Global = bGlobal
    End With
    RegExReplace = RegExp.Replace(strInput, strReplace)
    Set RegExp = Nothing
End Function

您应该能够使用它们来制作更实用,更优雅的解决方案。

您应该考虑类似于以下的正则表达式模式:

\b(\w+)\b

和类似以下的代码 - 对于每场比赛&amp;使用RegexMatches的子匹配,在其上尝试CDecCDate,如果得到错误,则拒绝它(没有错误会表明合法的日期或号码):

Dim Matches As VBScript_RegExp_55.MatchCollection
...
Set Matches = RegexMatches(InputText , "\b(\w+)\b")
                If Matches.Count > 0 Then
                    For CtrA = 0 To Matches.Count - 1
                        For CtrB = 0 To Matches(CtrA).SubMatches.Count - 1
                            On Error Resume Next
                            TestVariant = Null
                            TestVariant = CDec(Matches(CtrA).Submatches(CtrB))
                            TestVariant = CDate(Matches(CtrA).Submatches(CtrB))
                            On Error Goto 0
                            If IsNull(TestVariant) Then
                                ' Do further processing to check if the submatch can be split on non-alphanumeric characters... 
                            Else
                                GetAlphaNumericWords.Add Matches(CtrA).Submatches(CtrB), Matches(CtrA).Submatches(CtrB)
                            End If
                        Next
                    Next
                End If