Excel VBA以PDF格式搜索文本并提取和命名页面

时间:2016-05-12 16:32:26

标签: excel vba excel-vba pdf

我有以下代码,它查看我的电子表格的A列中的每个单元格,在指定的PDF中搜索它找到的文本,然后提取它在文本中找到PDF格式的页面,并将其命名为电子表格单元格中的值。代码有效但速度很慢,我可能需要在PDF中搜索多达200个单词,这可能长达600页。有没有办法让代码更快?目前,它循环遍历每个单元格搜索遍历每个单词的每个页面,直到它在单元格中找到该单词。

    Sub test_with_PDF()

    Dim objApp As Object
    Dim objPDDoc As Object
    Dim objjso As Object
    Dim wordsCount As Long
    Dim page As Long
    Dim i As Long
    Dim strData As String
    Dim strFileName As String
    Dim lastrow As Long, c As Range
    Dim PageNos As Integer
    Dim newPDF As Acrobat.CAcroPDDoc
    Dim NewName As String
    Dim Folder As String
    lastrow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row

    strFileName = selectFile()
    Folder = GetFolder()

    Set objApp = CreateObject("AcroExch.App")
    Set objPDDoc = CreateObject("AcroExch.PDDoc")
    'AD.1 open file, if =false file is damage
    If objPDDoc.Open(strFileName) Then
        Set objjso = objPDDoc.GetJSObject

 PageNos = 0
 For Each c In Sheets("Sheet1").Range("A2:A" & lastrow)

        For page = 0 To objPDDoc.GetNumPages - 1
            wordsCount = objjso.GetPageNumWords(page)
            For i = 0 To wordsCount

                If InStr(1, c.Value, ", ") = 0 Then

                    If objjso.getPageNthWord(page, i) = c.Value Then
                        PageNos = PageNos + 1
                        If FileExist(Folder & "\" & c.Offset(0, 4) & ".pdf") Then

                                Set newPDF = CreateObject("AcroExch.pdDoc")
                                NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
                                newPDF.Open (NewName)
                                newPDF.InsertPages lngPages, objPDDoc, page, 1, 0
                                newPDF.Save 1, NewName
                                newPDF.Close
                                Set newPDF = Nothing
                                Exit For
                         Else
                                Set newPDF = CreateObject("AcroExch.PDDoc")
                                newPDF.Create
                                NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
                                newPDF.InsertPages -1, objPDDoc, page, 1, 0
                                newPDF.Save 1, NewName
                                newPDF.Close
                                Set newPDF = Nothing
                                Exit For

                        End If
                    End If
                Else

                If objjso.getPageNthWord(page, i) = c.Offset(0, 1).Value Then
                    If objjso.getPageNthWord(page, i + 1) = c.Offset(0, 2).Value Then
                        PageNos = PageNos + 1
                         If FileExist(Folder & "\" & c.Offset(0, 4) & ".pdf") Then

                                Set newPDF = CreateObject("AcroExch.pdDoc")
                                NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
                                newPDF.Open (NewName)
                                newPDF.InsertPages lngPages, objPDDoc, page, 1, 0
                                newPDF.Save 1, NewName
                                newPDF.Close
                                Set newPDF = Nothing
                                Exit For
                         Else
                                Set newPDF = CreateObject("AcroExch.PDDoc")
                                newPDF.Create
                                NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
                                newPDF.InsertPages -1, objPDDoc, page, 1, 0
                                newPDF.Save 1, NewName
                                newPDF.Close
                                Set newPDF = Nothing
                                Exit For

                        End If
                        Exit For
                    End If
                End If
            End If
            Next i
        Next page
        c.Offset(0, 3).Value = PageNos
        PageNos = 0
    Next c
    MsgBox "Done"
    Else
        MsgBox "error!"
    End If
End Sub

Function FileExist(path As String) As Boolean
    If Dir(path) <> vbNullString Then FileExist = True
End Function
Function selectFile()
Dim fd As FileDialog, fileName As String

On Error GoTo ErrorHandler

Set fd = Application.FileDialog(msoFileDialogFilePicker)

fd.AllowMultiSelect = False

If fd.Show = True Then
    If fd.SelectedItems(1) <> vbNullString Then
        fileName = fd.SelectedItems(1)
    End If
Else
    'Exit code if no file is selected
    End
End If

'Return Selected FileName
selectFile = fileName

Set fd = Nothing

Exit Function

ErrorHandler:
Set fd = Nothing
MsgBox "Error " & Err & ": " & Error(Err)

End Function
Function GetFolder() As String
    Dim fldr As FileDialog
    Dim sItem As String
    Set fldr = Application.FileDialog(msoFileDialogFolderPicker)
    With fldr
        .Title = "Select the Folder where you want you new PDFs to go"
        .AllowMultiSelect = False
        .InitialFileName = Application.DefaultFilePath
        If .Show <> -1 Then GoTo NextCode
        sItem = .SelectedItems(1)
    End With
NextCode:
    GetFolder = sItem
    Set fldr = Nothing
End Function

非常感谢提前。

3 个答案:

答案 0 :(得分:1)

对于某些事情,循环绝对是出色的,但是可以将处理与这些较高的查询联系在一起。最近,我和一位同事正在执行类似的任务(尽管与pdf无关),我们使用range.find方法代替在每个单元格上执行instr的循环取得了很大的成功。

一些景点: -为了模拟使用.find方法时的“循环单元”功能,我们以.cells结尾我们的range语句,如下所示:

activesheet.usedrange.cells.find()

所需的字符串在()中。

-返回值:“一个Range对象,代表找到该信息的第一个单元格。”

.find方法返回范围后,随后的子例程可以提取页码和文档名称。

-如果需要查找事件的第n个实例,“您可以使用FindNext和FindPrevious方法重复搜索。”(Microsoft)

Microsoft range.find概述: https://msdn.microsoft.com/en-us/vba/excel-vba/articles/range-find-method-excel

因此,使用这种方法,用户可以使用基于列表中单元格数量的循环对每个字符串执行.find方法。

缺点是(我认为)这必须在excel应用程序中的文本上完成;另外,我还没有测试过它是否可以确定字符串是否必须独自驻留在单元格中(我不认为这是个问题)。

‘==================

另一个可能有益的建议是,首先批量复制.pdf中的所有文本,并尽可能减少循环(在文档对象级别上的直接操作)。然后,您的查找/返回方法可以应用于批量文本。

从教授的PowerPoint创建学习笔记时,我进行了类似的活动;我将所有文本抓到一个.txt文件中,然后返回包含字符串列表实例的每个句子。

‘===================

一些警告:我承认我没有按照您的项目的规模执行解析,所以我的建议在实践中可能没有优势。

此外,我在解析.pdf文档方面没有做太多工作,因为我尝试首先选择.txt / excel应用程序,然后使用它。

祝您好运;我希望我至少能够提供思想上的帮助!

答案 1 :(得分:0)

很抱歉发布一个快速,不完整的答案,但我想我可以指出你的方向。

而不是让系统查找数百亿次的两个术语,然后进行数千亿次比较,将搜索术语放入数组中,并将每个页面的文本转换为长字符串。然后它只有每页查找一次和200次比较。

'Dim your Clipboard functions
Public Declare PtrSafe Function OpenClipboard Lib "user32" (ByVal hwnd As Long) As Long
Public Declare PtrSafe Function EmptyClipboard Lib "user32" () As Long
Public Declare PtrSafe Function CloseClipboard Lib "user32" () As Long

'...

Dim objData As New MSForms.DataObject
Dim arrSearch() As String
Dim strTxt As String

'...

'Create array of search terms
For i = 2 To lastrow
    arrSearch(i - 2) = Sheets("Sheet1").Cells(1, i)
Next i

For page = 0 To objPDDoc.GetNumPages - 1

    '[Move each page into a new document. You already have that code]

    'Clear clipboard
    OpenClipboard (0&)
    EmptyClipboard
    CloseClipboard

    'Copy page to clipboard
    objApp.MenuItemExecute ("SelectAll")
    objApp.MenuItemExecute ("Copy")
    'You can also do this with the JavaScript object: objjso.ExecMenuItem("Item Name")
    'You may have to insert a waiting function like sleep() here to wait for the action to complete

    'Put data from clipboard into a string.
    objData.GetFromClipboard
    strTxt = objData.GetText 'Now you can search the entire content of the page at once, within memory

    'Compare each element of the array to the string
    For i = LBound(arrSearch) To UBound(arrSearch)
        If InStr(1, strTxt, arrSearch(i)) > 0 Then
            '[You found a match. Your code here]
        End If
    Next i

Next page

这仍然很麻烦,因为您必须在新文档中打开每个页面。如果有一种很好的方法来确定您纯粹通过文本处理的页面(例如页面a底部的页码,紧接着是页面b顶部的页眉),那么您可能会看到复制将文档的整个文本分成一个字符串,然后使用文本中的线索确定找到匹配后要提取的页面。我相信这会快得多。

答案 2 :(得分:0)

Sub BatchRenameCS()

Dim objApp As Object
Dim objPDDoc As Object
Dim objjso As Object
Dim newPDF As Acrobat.CAcroPDDoc
Dim lastrow2 As Long
Dim strFileName As String
Dim Folder As String
Dim Page As Long
Dim Cell As Long
Dim PDFCharacterCount() As Long
Dim CharacterCount As Long
Dim i As Integer
Dim c As Integer
Dim x As Integer
Dim strSource As String
Dim strResult As String
Dim PDFCharacters As String
Dim PDFCharacters2 As String
Dim PDFPasteData() As String
Dim PasteDataPage As Integer
Dim LastRow As Long
Dim NewName As String
Dim NewNamePageNum As Integer
Dim Check()

Sheets("Sheet1").Range("C:D").ClearContents

strFileName = selectFile()
Folder = GetFolder()

'create array with pdf word count
Set objApp = CreateObject("AcroExch.App")
Set objPDDoc = CreateObject("AcroExch.PDDoc")
'AD.1 open file, if =false file is damage
    If objPDDoc.Open(strFileName) Then
        Set objjso = objPDDoc.GetJSObject

ReDim PDFCharacterCount(1 To objPDDoc.GetNumPages) As Long

For Page = 1 To objPDDoc.GetNumPages
PDFCharacters = ""
PDFCharacters2 = ""
    For c = 0 To objjso.GetPageNumWords(Page - 1)
    PDFCharacters = PDFCharacters & objjso.getPageNthWord(Page - 1, c)
    Next c
    For i = 1 To Len(PDFCharacters)
        Select Case Asc(Mid(PDFCharacters, i, 1))
            Case 48 To 57, 65 To 90, 97 To 122:
            PDFCharacters2 = PDFCharacters2 & (Mid(PDFCharacters, i, 1))
            Case Else
            PDFCharacters2 = PDFCharacters2 & ""
        End Select
    Next
    PDFCharacterCount(Page) = Len(PDFCharacters2)

Next Page

lastrow2 = Sheets("Sheet2").Cells(Rows.Count, "A").End(xlUp).Row
Page = 1
ReDim PDFPasteData(1 To objPDDoc.GetNumPages) As String
For Cell = 1 To lastrow2
    strResult = ""
    strSource = Sheets("Sheet2").Cells(Cell, 1).Text
    PDFPasteData(Page) = PDFPasteData(Page) & " " & strSource
    For i = 1 To Len(strSource)
        Select Case Asc(Mid(strSource, i, 1))
            Case 48 To 57, 65 To 90, 97 To 122:
            strResult = strResult & (Mid(strSource, i, 1))
            Case Else
            strResult = strResult & ""
        End Select
    Next

CharacterCount = CharacterCount + Len(strResult)

If CharacterCount = PDFCharacterCount(Page) Then
CharacterCount = 0
Page = Page + 1
End If

Next Cell
ReDim Check(2, objPDDoc.GetNumPages)
LastRow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row
For Each LookUpCell In Worksheets("Sheet1").Range("A2:A" & LastRow)
    For PasteDataPage = 1 To objPDDoc.GetNumPages
        If InStr(PDFPasteData(PasteDataPage), LookUpCell.Value) Then
        Check(1, PasteDataPage) = Check(1, PasteDataPage) + 1
        Check(2, PasteDataPage) = Check(2, PasteDataPage) & LookUpCell.Offset(0, 1).Value & Chr(10)
                                If FileExist(Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf") Then

                                        Set newPDF = CreateObject("AcroExch.pdDoc")
                                        NewName = Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf"
                                        newPDF.Open (NewName)
                                        newPDF.InsertPages newPDF.GetNumPages - 1, objPDDoc, PasteDataPage - 1, 1, 0
                                        newPDF.Save 1, NewName
                                        newPDF.Close
                                        Set newPDF = Nothing
                                 Else
                                        Set newPDF = CreateObject("AcroExch.PDDoc")
                                        newPDF.Create
                                        NewName = Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf"
                                        newPDF.InsertPages -1, objPDDoc, PasteDataPage - 1, 1, 0
                                        newPDF.Save 1, NewName
                                        newPDF.Close
                                        Set newPDF = Nothing
                                End If
        End If
    Next PasteDataPage
Next LookUpCell
x = 1
For PasteDataPage = 1 To objPDDoc.GetNumPages
    If Check(1, PasteDataPage) <> 1 Then
    Sheets("Sheet1").Cells(x, 3) = PasteDataPage
    Sheets("Sheet1").Cells(x, 4) = Check(2, PasteDataPage)
    x = x + 1
    End If
Next PasteDataPage
End If
MsgBox "Done"
End Sub
Function FileExist(path As String) As Boolean
    If Dir(path) <> vbNullString Then FileExist = True
End Function
Function selectFile()
Dim fd As FileDialog, fileName As String
On Error GoTo ErrorHandler
Set fd = Application.FileDialog(msoFileDialogFilePicker)
fd.AllowMultiSelect = False
If fd.Show = True Then
    If fd.SelectedItems(1) <> vbNullString Then
        fileName = fd.SelectedItems(1)
    End If
Else
    'Exit code if no file is selected
    End
End If
'Return Selected FileName
selectFile = fileName
Set fd = Nothing
Exit Function
ErrorHandler:
Set fd = Nothing
MsgBox "Error " & Err & ": " & Error(Err)
End Function
Function GetFolder() As String
    Dim fldr As FileDialog
    Dim sItem As String
    Set fldr = Application.FileDialog(msoFileDialogFolderPicker)
    With fldr
        .Title = "Select the Folder where you want you new PDFs to go"
        .AllowMultiSelect = False
        .InitialFileName = Application.DefaultFilePath
        If .Show <> -1 Then GoTo NextCode
        sItem = .SelectedItems(1)
    End With
NextCode:
    GetFolder = sItem
    Set fldr = Nothing

End Function