我在Excel中有大约12000个包含RTF(带格式化标签)的单元格。我需要解析它们,以得到不带格式的纯文本。


{\fonttbl{\f0 \froman \fcharset0 Times New Roman;}{\f1 \fswiss \fcharset238
{\colortbl ;\red255\green255\blue255 ;\red0\green0\blue0 ;}
{\stylesheet{\fs24\cf2\cb1 Normal;}{\cs1\cf2\cb1 Default Paragraph Font;}}
\plain\plain\f1\fs24\pard TPR 0160 000\par IPR 0160 000\par OB-R-02-28\par}


TPR 0160 000
IPR 0160 000


但是,如果我将一个单元格的内容复制到空文本文档并将其保存为RTF,然后用MS Word打开,Word会立即解析文本并得到我想要的内容。不幸的是,对12000个单元格逐一这样做非常不方便。




如果您确实想要沿着使用Word解析文本的路线,此功能应该可以帮助您解决问题。正如评论所示,您需要引用MS Word对象库。

' Converts an RTF string to plain text by writing it to a temporary .rtf
' file and letting MS Word parse that file.
'
' Requires a reference to the Microsoft Word Object Library.
'
' strRTF  - the raw RTF markup.
' Returns - the text of the resulting Word document (wdDoc.Range.Text).
' Errors  - any error raised while writing the file or talking to Word is
'           re-raised to the caller after the temp file and the Word
'           document have been cleaned up.
Function ParseRTF(strRTF As String) As String
    Dim wdDoc       As Word.Document 'Ref: Microsoft Word Object Library
    Dim f           As Integer       'File I/O number
    Dim strFileTemp As String        'Full path of the temporary .rtf file
    Dim lngErr      As Long
    Dim strErrSrc   As String
    Dim strErrDesc  As String

    'Write into the user's temp folder; the root of C: is normally not
    'writable for standard users. Fall back to the old location if TEMP
    'is not defined.
    strFileTemp = Environ$("TEMP")
    If Len(strFileTemp) = 0 Then strFileTemp = "C:"
    strFileTemp = strFileTemp & "\TempFile_ParseRTF.rtf"

    On Error GoTo Fail

    'Save the RTF string in the temp file
    f = FreeFile
    Open strFileTemp For Output As #f
    Print #f, strRTF
    Close #f
    f = 0

    'Open the .rtf file as a Word.Document and read the parsed text
    Set wdDoc = GetObject(strFileTemp)
    ParseRTF = wdDoc.Range.Text

CleanUp:
    'Best-effort cleanup; failures here must not mask the original error
    On Error Resume Next
    If f <> 0 Then Close #f

    'Close the document BEFORE deleting the file: Word keeps the file
    'open, so Kill would fail while the document is still loaded.
    If Not wdDoc Is Nothing Then
        wdDoc.Close False
        Set wdDoc = Nothing
    End If
    If Len(Dir$(strFileTemp)) > 0 Then Kill strFileTemp
    On Error GoTo 0

    If lngErr <> 0 Then Err.Raise lngErr, strErrSrc, strErrDesc
    Exit Function

Fail:
    'Remember the error, leave the handler, clean up, then re-raise
    lngErr = Err.Number
    strErrSrc = Err.Source
    strErrDesc = Err.Description
    Resume CleanUp
End Function


' Parses the RTF held in every cell of A1:A12000 on the active sheet and
' writes the plain-text result into the cell one column to the right.
Sub ParseAllRange()
    Dim rngCell As Range
    Dim strRTF  As String

    For Each rngCell In Range("A1:A12000")

        'Parse the cell contents
        strRTF = ParseRTF(CStr(rngCell))

        'Output to the cell one column over
        rngCell.Offset(0, 1) = strRTF

    Next rngCell   'closing statement of the For Each loop (was missing)
End Sub





' clsRTFParser (class module)
'
' Converts RTF markup to plain text by placing the markup on the Windows
' clipboard in "Rich Text Format" and pasting it into a Word document that
' lives as long as the class instance.
'
' Requires a reference to the Microsoft Word Object Library.
' NOTE: every call to ParseRTF overwrites the user's clipboard.

'Win32 API declarations. The VBA7 branch is required for 64-bit Office
'(PtrSafe / LongPtr); the Else branch keeps the original declarations for
'older hosts.
#If VBA7 Then
    Private Declare PtrSafe Function GlobalAlloc Lib "kernel32" _
                    (ByVal wFlags As Long, ByVal dwBytes As LongPtr) As LongPtr
    Private Declare PtrSafe Function GlobalLock Lib "kernel32" _
                    (ByVal hMem As LongPtr) As LongPtr
    Private Declare PtrSafe Function GlobalUnlock Lib "kernel32" _
                    (ByVal hMem As LongPtr) As Long
    Private Declare PtrSafe Function GlobalFree Lib "kernel32" _
                    (ByVal hMem As LongPtr) As LongPtr
    Private Declare PtrSafe Function lstrcpy Lib "kernel32" _
                    (ByVal lpString1 As LongPtr, ByVal lpString2 As String) As LongPtr

    Private Declare PtrSafe Function OpenClipboard Lib "user32" _
                    (ByVal Hwnd As LongPtr) As Long
    Private Declare PtrSafe Function EmptyClipboard Lib "user32" () As Long
    Private Declare PtrSafe Function RegisterClipboardFormat Lib "user32" Alias _
                    "RegisterClipboardFormatA" (ByVal lpString As String) As Long
    Private Declare PtrSafe Function SetClipboardData Lib "user32" _
                    (ByVal wFormat As Long, ByVal hMem As LongPtr) As LongPtr
    Private Declare PtrSafe Function CloseClipboard Lib "user32" () As Long
#Else
    Private Declare Function GlobalAlloc Lib "kernel32" _
                    (ByVal wFlags As Long, ByVal dwBytes As Long) As Long
    Private Declare Function GlobalLock Lib "kernel32" _
                    (ByVal hMem As Long) As Long
    Private Declare Function GlobalUnlock Lib "kernel32" _
                    (ByVal hMem As Long) As Long
    Private Declare Function GlobalFree Lib "kernel32" _
                    (ByVal hMem As Long) As Long
    Private Declare Function lstrcpy Lib "kernel32" _
                    (ByVal lpString1 As Long, ByVal lpString2 As String) As Long

    Private Declare Function OpenClipboard Lib "user32" _
                    (ByVal Hwnd As Long) As Long
    Private Declare Function EmptyClipboard Lib "user32" () As Long
    Private Declare Function RegisterClipboardFormat Lib "user32" Alias _
                    "RegisterClipboardFormatA" (ByVal lpString As String) As Long
    Private Declare Function SetClipboardData Lib "user32" _
                    (ByVal wFormat As Long, ByVal hMem As Long) As Long
    Private Declare Function CloseClipboard Lib "user32" () As Long
#End If

'GMEM_MOVEABLE (&H2) Or GMEM_ZEROINIT (&H40)
Private Const GHND As Long = &H42

'Word document used as the paste target for the lifetime of the instance
Private wdDoc As Word.Document 'Ref: Microsoft Word Object Library

' Creates the Word document that will receive the pasted RTF.
Private Sub Class_Initialize()
    Set wdDoc = New Word.Document
End Sub

' Closes the Word document without saving.
Private Sub Class_Terminate()
    'wdDoc is Nothing if Class_Initialize failed (e.g. Word unavailable)
    If Not wdDoc Is Nothing Then
        wdDoc.Close False
        Set wdDoc = Nothing
    End If
End Sub


' Places strCopyString on the clipboard as "Rich Text Format".
' Returns True only when the data was handed to the clipboard and the
' clipboard was closed successfully.
Private Function CopyRTF(strCopyString As String) As Boolean
#If VBA7 Then
    Dim hGlobalMemory  As LongPtr
    Dim lpGlobalMemory As LongPtr
    Dim hClipMemory    As LongPtr
#Else
    Dim hGlobalMemory  As Long
    Dim lpGlobalMemory As Long
    Dim hClipMemory    As Long
#End If
    Dim lngFormatRTF   As Long
    Dim lngBytes       As Long
    Dim blnHandedOver  As Boolean

    'Size the buffer by the ANSI byte length (what lstrcpy will write),
    'not the character count, plus one byte for the terminating null.
    lngBytes = LenB(StrConv(strCopyString, vbFromUnicode)) + 1

    'Allocate and copy string to memory
    hGlobalMemory = GlobalAlloc(GHND, lngBytes)
    If hGlobalMemory = 0 Then Exit Function

    lpGlobalMemory = GlobalLock(hGlobalMemory)
    If lpGlobalMemory <> 0 Then
        lstrcpy lpGlobalMemory, strCopyString

        'Unlock the memory and then copy to the clipboard
        If GlobalUnlock(hGlobalMemory) = 0 Then
            If OpenClipboard(0&) <> 0 Then
                Call EmptyClipboard

                'Save the data as Rich Text Format
                lngFormatRTF = RegisterClipboardFormat("Rich Text Format")
                hClipMemory = SetClipboardData(lngFormatRTF, hGlobalMemory)

                'On success the system owns the memory block from now on
                blnHandedOver = (hClipMemory <> 0)

                CopyRTF = (CloseClipboard() <> 0) And blnHandedOver
            End If
        End If
    End If

    'The block is still ours if it never reached the clipboard: free it
    If Not blnHandedOver Then GlobalFree hGlobalMemory
End Function


' Pastes the clipboard into wdDoc (replacing its content) and returns the
' document's plain text with two characters removed from each end.
Private Function PasteRTF() As String
    Dim strOutput As String

    'Paste the clipboard data to the wdDoc and read the plain text result
    wdDoc.Range.Paste
    strOutput = wdDoc.Range.Text

    'Get rid of the new lines at the beginning and end of the document.
    'The length checks avoid a run-time error on very short results.
    If Len(strOutput) >= 2 Then strOutput = Left$(strOutput, Len(strOutput) - 2)
    If Len(strOutput) >= 2 Then strOutput = Right$(strOutput, Len(strOutput) - 2)

    PasteRTF = strOutput
End Function


' Converts strRTF to plain text.
' Returns the parsed text, or the literal string
' "Error in copying to clipboard" when the clipboard copy failed.
Public Function ParseRTF(strRTF As String) As String
    If CopyRTF(strRTF) Then
        ParseRTF = PasteRTF
    Else
        ParseRTF = "Error in copying to clipboard"
    End If
End Function


' Parses the RTF held in every cell of A1:A12000 on the active sheet with a
' single clsRTFParser instance and writes the plain-text result into the
' cell one column to the right.
Sub CopyParseAllRange()
    Dim rngCell As Range
    Dim strRTF  As String

    'Create new instance of clsRTFParser
    Dim RTFParser As clsRTFParser
    Set RTFParser = New clsRTFParser

    For Each rngCell In Range("A1:A12000")

        'Parse the cell contents
        strRTF = RTFParser.ParseRTF(CStr(rngCell))

        'Output to the cell one column over
        rngCell.Offset(0, 1) = strRTF

    Next rngCell   'closing statement of the For Each loop (was missing)
End Sub


每个RTF控制代码以“\”开头,以空格结尾,中间没有任何多余的空格。“{}”用于分组。如果您的文本本身不包含这些字符,就可以直接删除它们(“;”同理)。这样剩下的就是您的文本以及一些不必要的单词,如“Arial”、“Normal”等。您也可以建立一个词典来删除它们。经过一些调整,最终只会留下所需的文本。

请查看http://www.regular-expressions.info/以获取更多信息以及编写RegExp的绝佳工具(RegexBuddy - 遗憾的是它不是免费的,但它值得花钱.AFAIR还有试用版)。

更新:当然,我不建议你对每个单元格手动操作。只需遍历活动范围即可,参考这个帖子: SO: About iterating through cells in VBA


' Visits every cell in the used range of the active sheet.
Sub Iterate()
   Dim Cell As Range   'declared so the snippet also compiles under Option Explicit

   For Each Cell In ActiveSheet.UsedRange.Cells
      'Do something
   Next Cell           'closing statement of the For Each loop (was missing)
End Sub


参见: Regex functions in Excel、Regex in VBA


此处的一些解决方案需要引用MS Word对象库。受限于我手头的条件,我找到了一个不依赖它的解决方案。它用VBA剥离RTF标签以及字体表、样式表等多余内容。它可能对你有所帮助。我用你的数据运行了它,除了空白字符之外,得到的输出与你预期的相同。



' Returns True when Character occurs in the set of characters that may
' form an RTF tag name here: A-Z, a-z, 0-9 and "-"; otherwise False.
Public Function Alphanumeric(Character As String) As Boolean
   If InStr("ABCDEFGHIJKKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-", Character) Then
       Alphanumeric = True
   Else
       'Without this Else branch the True result was always overwritten
       Alphanumeric = False
   End If
End Function


' Removes the first RTF group whose opening text is "{\" & GroupName,
' including any groups nested inside it.
'
' RTFString - the RTF markup to process.
' GroupName - group name without the leading "{\" (e.g. "fonttbl").
' Returns   - RTFString with the group text removed, or RTFString unchanged
'             when the group does not occur. If the group is never closed,
'             everything from its start to the end of the string is removed.
Public Function RemoveGroup(RTFString As String, GroupName As String) As String
    'Long rather than Integer: cell text can exceed 32767 characters
    Dim GroupStart As Long
    Dim Pos As Long
    Dim Depth As Long
    Dim Total As Long

    GroupStart = InStr(RTFString, "{\" & GroupName)

    ' If the group was not found in the RTF string, then just return that string unchanged.
    If GroupStart = 0 Then
        RemoveGroup = RTFString
        Exit Function
    End If

    ' Otherwise scan along from the start of the group until its end is found.
    ' The group is delimited by { and }. Groups may be nested, so count up on {
    ' and down on }. When the count returns to zero, the group has ended.
    ' The Pos <= Total test stops the scan on unbalanced braces instead of
    ' looping forever.
    Total = Len(RTFString)
    Pos = GroupStart
    Do
        Select Case Mid$(RTFString, Pos, 1)
            Case "{"
                Depth = Depth + 1
            Case "}"
                Depth = Depth - 1
        End Select
        Pos = Pos + 1
    Loop While Depth > 0 And Pos <= Total

    RemoveGroup = Replace(RTFString, Mid$(RTFString, GroupStart, Pos - GroupStart), "")

End Function


' Strips RTF tags from RTFString and returns the remaining text.
'
' A tag is a "\" followed by the run of characters for which Alphanumeric
' returns True. The character that ends the tag (for example the space
' after it) is kept as ordinary text.
Public Function RemoveTags(RTFString As String) As String
    Dim Total As Long
    Dim L As Long       'start of the current stretch of plain text
    Dim R As Long       'position of the next "\" / end of the current tag
    Dim Result As String

    Total = Len(RTFString)
    L = 1

    ' Search to the end of the string (inclusive, so the last character is
    ' no longer dropped).
    Do While L <= Total
        R = InStr(L, RTFString, "\")

        ' No further tag: everything that is left is plain text.
        If R = 0 Then
            Result = Result & Mid$(RTFString, L)
            Exit Do
        End If

        ' Append anything that's not a tag to the return value, one whole
        ' stretch at a time rather than character by character.
        Result = Result & Mid$(RTFString, L, R - L)

        ' Search to the end of the tag.
        R = R + 1
        Do While R <= Total
            If Not Alphanumeric(Mid$(RTFString, R, 1)) Then Exit Do
            R = R + 1
        Loop
        L = R
    Loop

    RemoveTags = Result
End Function


' Returns RTFString with every "{" and "}" character deleted.
Public Function RemoveBraces(RTFString As String) As String
    Dim WithoutOpening As String

    WithoutOpening = Replace(RTFString, "{", "")
    RemoveBraces = Replace(WithoutOpening, "}", "")
End Function


' Reduces an RTF string to its text: deletes CR+LF pairs, removes the
' fonttbl, colortbl and stylesheet groups, deletes all braces and finally
' strips the remaining tags.
' Note: only CR+LF pairs are deleted; a lone CR or LF is left in place.
Public Function RemoveTheFluff(RTFString As String) As String
    Dim Working As String
    Dim GroupName As Variant

    Working = Replace(RTFString, vbCrLf, "")

    ' Drop the header groups, in this order.
    For Each GroupName In Array("fonttbl", "colortbl", "stylesheet")
        Working = RemoveGroup(Working, CStr(GroupName))
    Next GroupName

    Working = RemoveBraces(Working)
    RemoveTheFluff = RemoveTags(Working)
End Function


Solution using .Net Framework RichTextBox control

只需2行代码即可将每个单元格中的RTF转换为纯文本(需要先做少量系统配置,获取正确的.tlb文件以便引用.Net Framework)。将单元格值放入 rtfText

' Let a .Net RichTextBox do the conversion: assign the RTF markup to its
' RTF property, then read the plain text back from its Text property.
Set miracle = New System_Windows_Forms.RichTextBox
miracle.RTF = rtfText
PlainText = miracle.Text