在Excel Vba中解析字符串

时间:2016-11-27 18:44:04

标签: regex vba excel-vba dictionary excel

我有一个宏向服务器发送XMLHTTP请求,它作为响应获得纯文本字符串,而不是JSON格式字符串或其他标准格式(至少对于我所知道的)。

我想解析输出字符串,以便以与此link

parseJson 子例程相同的方式访问结构化方法中的数据

我的问题是我对正则表达式不满意,而且我无法根据需要修改例程。

我需要解析的字符串具有以下结构:

  1. 字符串是单行
  2. 每个参数都由其参数名称定义为相等的simbol,其值以及以;结尾; " NID = 3;" " SID =测试;"
  3. 参数可以收集在"结构"以符号开始和结束并且他们的名字后面跟着他们;例如 | STEST; NID = 3; SID = Test; |
  4. 结构可以包含其他结构
  5. 输出字符串的示例如下

    |KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|
    

    在这种情况下,有一个宏结构 KC ,其中包含 AD 结构。结构 AD 由参数 PE PF 和2个结构 CD 组成。最后结构 CD 的参数 PE HP

    所以我想解析字符串以获得反映这种结构的 Object / Dictionary ,你能帮助我吗?

    在第一个答案后添加

    大家好,谢谢你的帮助,但我想我应该更清楚一点,我想得到的输出。 对于我的示例字符串,我希望有一个具有以下结构的对象:

    <KC>
        <AD>
            <PE>5</PE>
            <PF>3</PF>
            <CD>
                <PE>5</PE>
                <HP>test</HP>
            </CD>
            <CD>
                <PE>3</PE>
                <HP>abc</HP>
            </CD>
        </AD>
    </KC>
    

    所以我开始根据@Nvj的一些提示和link

    中的答案编写一个可能的工作代码库。
    Option Explicit
    Option Base 1
    
    Sub Test()
    
      Dim strContent As String
      Dim strState   As String
      Dim varOutput  As Variant
    
      strContent = "|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|"
      Call ParseString(strContent, varOutput, strState)
    
    End Sub
    
    Sub ParseString(ByVal strContent As String, varOutput As Variant, strState As String)
    ' strContent - source string
    ' varOutput - created object or array to be returned as result
    ' strState - Object|Array|Error depending on processing to be returned as state
    Dim objTokens As Object
    Dim lngTokenId As Long
    Dim objRegEx As Object
    Dim bMatched As Boolean
    
    Set objTokens = CreateObject("Scripting.Dictionary")
    lngTokenId = 0
    Set objRegEx = CreateObject("VBScript.RegExp")
    With objRegEx
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
        .Pattern = "\|[A-Z]{2};"  'Pattern for the name of structures
        Tokenize objTokens, objRegEx, strContent, lngTokenId, bMatched, "str"
        .Pattern = "[A-Z]{2}=[^\|=;]+;" 'Pattern for parameters name and values
        Tokenize objTokens, objRegEx, strContent, lngTokenId, bMatched, "par"
    End With
    
    End Sub
    
    Sub Tokenize(objTokens, objRegEx, strContent, lngTokenId, bMatched, strType)
    Dim strKey        As String
    Dim strKeyPar     As String
    Dim strKeyVal     As String
    
    Dim strWork       As String
    Dim strPar        As String
    Dim strVal        As String
    Dim strLevel      As String
    
    Dim strRes        As String
    
    Dim lngCopyIndex  As Long
    Dim objMatch      As Object
    
    strRes = ""
    lngCopyIndex = 1
    With objRegEx
        For Each objMatch In .Execute(strContent)
            If strType = "str" Then
              bMatched = True
              With objMatch
                  strWork = Replace(.Value, "|", "")
                  strWork = Replace(strWork, ";", "")
                  strLevel = get_Level(strWork)
                  strKey = "<" & lngTokenId & strLevel & strType & ">"
                  objTokens(strKey) = strWork
                  strRes = strRes & Mid(strContent, lngCopyIndex, .FirstIndex - lngCopyIndex + 1) & strKey
                  lngCopyIndex = .FirstIndex + .Length + 1
              End With
              lngTokenId = lngTokenId + 1
            ElseIf strType = "par" Then
    
              strKeyPar = "<" & lngTokenId & "par>"
              strKeyVal = "<" & lngTokenId & "val>"
              strKey = strKeyPar & strKeyVal
              bMatched = True
              With objMatch
                  strWork = Replace(.Value, ";", "")
                  strPar = Split(strWork, "=")(0)
                  strVal = Split(strWork, "=")(1)
                  objTokens(strKeyPar) = strPar
                  objTokens(strKeyVal) = strVal
                  strRes = strRes & Mid(strContent, lngCopyIndex, .FirstIndex - lngCopyIndex + 1) & strKey
                  lngCopyIndex = .FirstIndex + .Length + 1
              End With
              lngTokenId = lngTokenId + 2
    
            End If
        Next
        strContent = strRes & Mid(strContent, lngCopyIndex, Len(strContent) - lngCopyIndex + 1)
    End With
    End Sub
    
    Function get_Level(strInput As String) As String
    
    Select Case strInput
      Case "KC"
      get_Level = "L1"
      Case "AD"
      get_Level = "L2"
      Case "CD"
      get_Level = "L3"
      Case Else
      MsgBox ("Error")
      End
    End Select
    
    End Function
    

    此函数创建一个字典,其中包含每个结构名称,参数名称和参数值的项目,如图所示 enter image description here 由于函数get_Level,与结构关联的项目具有一个有助于保留数据原始层次结构的级别。

    所以我缺少的是一个创建具有输入字符串原始结构的对象的函数。这是Retrieve函数在此答案link中执行的操作,但我不知道如何使其适应我的案例

3 个答案:

答案 0 :(得分:2)

这看起来像一个简单的嵌套分隔字符串。一些Split()函数可以解决这个问题:

Option Explicit

Function parseString(str As String) As Collection

    Dim a1() As String, i1 As Long, c1 As Collection
    Dim a2() As String, i2 As Long, c2 As Collection
    Dim a3() As String

    a1 = Split(str, "|")
    Set c1 = New Collection
    For i1 = LBound(a1) To UBound(a1)
        If a1(i1) <> "" Then
            Set c2 = New Collection
            a2 = Split(a1(i1), ";")
            For i2 = LBound(a2) To UBound(a2)
                If a2(i2) <> "" Then
                    a3 = Split(a2(i2), "=")
                    If UBound(a3) > 0 Then
                        c2.Add a3(1), a3(0)
                    ElseIf UBound(a3) = 0 Then
                        c2.Add a3(0)
                    End If
                End If
            Next i2
            c1.Add c2
        End If
    Next i1

    Set parseString = c1

End Function


Sub testParseString()

    Dim c As Collection

    Set c = parseString("|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|")

    Debug.Assert c(1)(1) = "KC"
    Debug.Assert c(2)("PE") = "5"
    Debug.Assert c(3)(1) = "CD"
    Debug.Assert c(4)("HP") = "abc"
    Debug.Assert c(4)(3) = "abc"  

End Sub

请注意,您可以通过索引和键(如果输入中存在键)来寻址值。如果未提供密钥,则只能通过其索引访问该值。您还可以递归迭代集合以获取树结构中的所有值。

深思熟虑:因为你的结构可能有重复的名字(在你的情况下“CD”结构发生两次)集合/字典会发现它存在问题(由于关键的碰撞)。解决此问题的另一个好方法是使用DOMDocument创建XML结构并使用XPath访问其元素。见Program with DOM in Visual Basic

UPDATE :我也在下面添加了XML示例。看看。

答案 1 :(得分:1)

这是使用DOMDocument XML解析器对字符串解析问题的另一种看法。您需要在VBA引用中包含Microsoft XML,v.6.0。

Function parseStringToDom(str As String) As DOMDocument60

    Dim a1() As String, i1 As Long
    Dim a2() As String, i2 As Long
    Dim a3() As String

    Dim dom As DOMDocument60
    Dim rt As IXMLDOMNode
    Dim nd As IXMLDOMNode

    Set dom = New DOMDocument60
    dom.async = False
    dom.validateOnParse = False
    dom.resolveExternals = False
    dom.preserveWhiteSpace = True

    Set rt = dom.createElement("root")
    dom.appendChild rt

    a1 = Split(str, "|")
    For i1 = LBound(a1) To UBound(a1)
        If a1(i1) <> "" Then
            a2 = Split(a1(i1), ";")
            Set nd = dom.createElement(a2(0))
            For i2 = LBound(a2) To UBound(a2)
                If a2(i2) <> "" Then
                    a3 = Split(a2(i2), "=")
                    If UBound(a3) > 0 Then
                        nd.appendChild dom.createElement(a3(0))
                        nd.LastChild.Text = a3(1)
                    End If
                End If
            Next i2
            rt.appendChild nd
        End If
    Next i1

    Set parseStringToDom = dom

End Function


Sub testParseStringToDom()

    Dim dom As DOMDocument60

    Set dom = parseStringToDom("|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|")

    Debug.Assert Not dom.SelectSingleNode("/root/KC") Is Nothing
    Debug.Assert dom.SelectSingleNode("/root/AD/PE").Text = "5"
    Debug.Assert dom.SelectSingleNode("/root/CD[1]/HP").Text = "test"
    Debug.Assert dom.SelectSingleNode("/root/CD[2]/HP").Text = "abc"

    Debug.Print dom.XML

End Sub

正如您所看到的,这会将您的文本转换为XML DOM文档,从而保留所有结构并允许重命名。然后,您可以使用XPath访问任何节点或值。这也可以扩展到具有更多嵌套级别和更多结构。

这是它在幕后创建的XML文档:

<root>
    <KC/>
    <AD>
        <PE>5</PE>
        <PF>3</PF>
    </AD>
    <CD>
        <PE>5</PE>
        <HP>test</HP>
    </CD>
    <CD>
        <PE>3</PE>
        <HP>abc</HP>
    </CD>
</root>

答案 2 :(得分:0)

我已经开始在VBA中为你指定的字符串结构编写一个解析器,并且它不完整,但我会发布它。也许你可以从中获取一些想法。

Sub ParseString()

    Dim str As String
    str = "|KC;|AD;PE=5;PF=3;|CD;PE=5;HP=test;|CD;PE=3;HP=abc;|"

    ' Declare an object dictionary
    ' Make a reference to Microsoft Scripting Runtime in order for this to work
    Dim dict As New Dictionary

    ' If the bars are present in the first and last character of the string, replace them
    str = Replace(str, "|", "", 1, 1)
    If (Mid(str, Len(str), 1) = "|") Then
        str = Mid(str, 1, Len(str) - 1)
    End If

    ' Split the string by bars
    Dim substring_array() As String
    substring_array = Split(str, "|")

    ' Declare a regex object
    ' Check the reference to Microsoft VBScript Regular Expressions 5.5 in order for this to work
    Dim regex As New RegExp
    With regex
        .Global = True
        .IgnoreCase = True
        .MultiLine = True
    End With

    ' Object to store the regex matches
    Dim matches As MatchCollection
    Dim param_name_matches As MatchCollection
    Dim parameter_value_matches As MatchCollection

    ' Define some regex patterns
    pattern_for_structure_name = "^[^=;]+;"
    pattern_for_parameters = "[^=;]+=[^=;]+;"
    pattern_for_parameter_name = "[^=;]="
    pattern_for_parameter_val = "[^=;];"

    ' Loop through the elements of the array
    Dim i As Integer
    For i = 0 To UBound(substring_array) - LBound(substring_array)

        ' Get the array element in a string
        str1 = substring_array(i)

        ' Check if it contains a structure name
        regex.Pattern = pattern_for_structure_name
        Set matches = regex.Execute(str1)

        If matches.Count = 0 Then

            ' This substring does not contain a structure name
            ' Check if it contains parameters
            regex.Pattern = pattern_for_parameter
            Set matches = regex.Execute(matches(0).Value)
            If matches.Count = 0 Then

                ' There are no parameters as well as no structure name
                ' This means the string had || - invalid string
                MsgBox ("Invalid string")

            Else

                ' The string contains parameter names
                ' Add each parameter name to the dictionary
                Dim my_match As match
                For Each my_match In matches

                    ' Get the name of the parameter
                    regex.Pattern = pattern_for_parameter_name
                    Set parameter_name_matches = regex.Execute(my_match.Value)

                    ' Check if the above returned any matches
                    If parameter_name_matches.Count = 1 Then

                        ' Remove = sign from the parameter name
                        parameter_name = Replace(parameter_name_matches(0).Value, "=", "")

                        ' Get the value of the parameter
                        regex.Pattern = pattern_for_parameter_value
                        Set parameter_value_matches = regex.Execute(my_match.Value)

                        ' Check if the above returned any matches
                        If parameter_value_matches.Count = 1 Then

                            ' Get the value
                            parameter_value = Replace(parameter_value_matches(0).Value, ";", "")

                            ' Add the parameter name and value as a key pair to the Dictionary object
                            dict.Item(parameter_name) = parameter_value

                        Else

                            ' Number of matches is either 0 or greater than 1 - in both cases the string is invalid
                            MsgBox ("Invalid string")

                        End If

                    Else

                        ' Parameter name did not match - invalid string
                        MsgBox ("Invalid string")

                    End If

                Next

            End If

        ElseIf matches.Count = 1 Then

            ' This substring contains a single structure name
            ' Check if it has parameter names

        Else

            ' This substring contains more than one structure name - the original string is invalid
            MsgBox ("Invalid string")

        End If

    Next i

End Sub