我有以下代码,它查看我的电子表格的A列中的每个单元格,在指定的PDF中搜索它找到的文本,然后提取它在文本中找到PDF格式的页面,并将其命名为电子表格单元格中的值。代码有效但速度很慢,我可能需要在PDF中搜索多达200个单词,这可能长达600页。有没有办法让代码更快?目前,它循环遍历每个单元格搜索遍历每个单词的每个页面,直到它在单元格中找到该单词。
Sub test_with_PDF()
Dim objApp As Object
Dim objPDDoc As Object
Dim objjso As Object
Dim wordsCount As Long
Dim page As Long
Dim i As Long
Dim strData As String
Dim strFileName As String
Dim lastrow As Long, c As Range
Dim PageNos As Integer
Dim newPDF As Acrobat.CAcroPDDoc
Dim NewName As String
Dim Folder As String
lastrow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row
strFileName = selectFile()
Folder = GetFolder()
Set objApp = CreateObject("AcroExch.App")
Set objPDDoc = CreateObject("AcroExch.PDDoc")
'AD.1 open file, if =false file is damage
If objPDDoc.Open(strFileName) Then
Set objjso = objPDDoc.GetJSObject
PageNos = 0
For Each c In Sheets("Sheet1").Range("A2:A" & lastrow)
For page = 0 To objPDDoc.GetNumPages - 1
wordsCount = objjso.GetPageNumWords(page)
For i = 0 To wordsCount
If InStr(1, c.Value, ", ") = 0 Then
If objjso.getPageNthWord(page, i) = c.Value Then
PageNos = PageNos + 1
If FileExist(Folder & "\" & c.Offset(0, 4) & ".pdf") Then
Set newPDF = CreateObject("AcroExch.pdDoc")
NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
newPDF.Open (NewName)
newPDF.InsertPages lngPages, objPDDoc, page, 1, 0
newPDF.Save 1, NewName
newPDF.Close
Set newPDF = Nothing
Exit For
Else
Set newPDF = CreateObject("AcroExch.PDDoc")
newPDF.Create
NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
newPDF.InsertPages -1, objPDDoc, page, 1, 0
newPDF.Save 1, NewName
newPDF.Close
Set newPDF = Nothing
Exit For
End If
End If
Else
If objjso.getPageNthWord(page, i) = c.Offset(0, 1).Value Then
If objjso.getPageNthWord(page, i + 1) = c.Offset(0, 2).Value Then
PageNos = PageNos + 1
If FileExist(Folder & "\" & c.Offset(0, 4) & ".pdf") Then
Set newPDF = CreateObject("AcroExch.pdDoc")
NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
newPDF.Open (NewName)
newPDF.InsertPages lngPages, objPDDoc, page, 1, 0
newPDF.Save 1, NewName
newPDF.Close
Set newPDF = Nothing
Exit For
Else
Set newPDF = CreateObject("AcroExch.PDDoc")
newPDF.Create
NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
newPDF.InsertPages -1, objPDDoc, page, 1, 0
newPDF.Save 1, NewName
newPDF.Close
Set newPDF = Nothing
Exit For
End If
Exit For
End If
End If
End If
Next i
Next page
c.Offset(0, 3).Value = PageNos
PageNos = 0
Next c
MsgBox "Done"
Else
MsgBox "error!"
End If
End Sub
Function FileExist(path As String) As Boolean
If Dir(path) <> vbNullString Then FileExist = True
End Function
Function selectFile()
Dim fd As FileDialog, fileName As String
On Error GoTo ErrorHandler
Set fd = Application.FileDialog(msoFileDialogFilePicker)
fd.AllowMultiSelect = False
If fd.Show = True Then
If fd.SelectedItems(1) <> vbNullString Then
fileName = fd.SelectedItems(1)
End If
Else
'Exit code if no file is selected
End
End If
'Return Selected FileName
selectFile = fileName
Set fd = Nothing
Exit Function
ErrorHandler:
Set fd = Nothing
MsgBox "Error " & Err & ": " & Error(Err)
End Function
Function GetFolder() As String
Dim fldr As FileDialog
Dim sItem As String
Set fldr = Application.FileDialog(msoFileDialogFolderPicker)
With fldr
.Title = "Select the Folder where you want you new PDFs to go"
.AllowMultiSelect = False
.InitialFileName = Application.DefaultFilePath
If .Show <> -1 Then GoTo NextCode
sItem = .SelectedItems(1)
End With
NextCode:
GetFolder = sItem
Set fldr = Nothing
End Function
非常感谢提前。
答案 0 :(得分:1)
对于某些事情,循环绝对是出色的,但是可以将处理与这些较高的查询联系在一起。最近,我和一位同事正在执行类似的任务(尽管与pdf无关),我们使用range.find方法代替在每个单元格上执行instr的循环取得了很大的成功。
一些景点: -为了模拟使用.find方法时的“循环单元”功能,我们以.cells结尾我们的range语句,如下所示:
activesheet.usedrange.cells.find()
所需的字符串在()中。
-返回值:“一个Range对象,代表找到该信息的第一个单元格。”
.find方法返回范围后,随后的子例程可以提取页码和文档名称。
-如果需要查找事件的第n个实例,“您可以使用FindNext和FindPrevious方法重复搜索。”(Microsoft)
Microsoft range.find概述: https://msdn.microsoft.com/en-us/vba/excel-vba/articles/range-find-method-excel
因此,使用这种方法,用户可以使用基于列表中单元格数量的循环对每个字符串执行.find方法。
缺点是(我认为)这必须在excel应用程序中的文本上完成;另外,我还没有测试过它是否可以确定字符串是否必须独自驻留在单元格中(我不认为这是个问题)。
‘==================
另一个可能有益的建议是,首先批量复制.pdf中的所有文本,并尽可能减少循环(在文档对象级别上的直接操作)。然后,您的查找/返回方法可以应用于批量文本。
从教授的PowerPoint创建学习笔记时,我进行了类似的活动;我将所有文本抓到一个.txt文件中,然后返回包含字符串列表实例的每个句子。
‘===================
一些警告:我承认我没有按照您的项目的规模执行解析,所以我的建议在实践中可能没有优势。
此外,我在解析.pdf文档方面没有做太多工作,因为我尝试首先选择.txt / excel应用程序,然后使用它。
祝您好运;我希望我至少能够提供思想上的帮助!
答案 1 :(得分:0)
很抱歉发布一个快速,不完整的答案,但我想我可以指出你的方向。
而不是让系统查找数百亿次的两个术语,然后进行数千亿次比较,将搜索术语放入数组中,并将每个页面的文本转换为长字符串。然后它只有每页查找一次和200次比较。
'Dim your Clipboard functions
Public Declare PtrSafe Function OpenClipboard Lib "user32" (ByVal hwnd As Long) As Long
Public Declare PtrSafe Function EmptyClipboard Lib "user32" () As Long
Public Declare PtrSafe Function CloseClipboard Lib "user32" () As Long
'...
Dim objData As New MSForms.DataObject
Dim arrSearch() As String
Dim strTxt As String
'...
'Create array of search terms
For i = 2 To lastrow
arrSearch(i - 2) = Sheets("Sheet1").Cells(1, i)
Next i
For page = 0 To objPDDoc.GetNumPages - 1
'[Move each page into a new document. You already have that code]
'Clear clipboard
OpenClipboard (0&)
EmptyClipboard
CloseClipboard
'Copy page to clipboard
objApp.MenuItemExecute ("SelectAll")
objApp.MenuItemExecute ("Copy")
'You can also do this with the JavaScript object: objjso.ExecMenuItem("Item Name")
'You may have to insert a waiting function like sleep() here to wait for the action to complete
'Put data from clipboard into a string.
objData.GetFromClipboard
strTxt = objData.GetText 'Now you can search the entire content of the page at once, within memory
'Compare each element of the array to the string
For i = LBound(arrSearch) To UBound(arrSearch)
If InStr(1, strTxt, arrSearch(i)) > 0 Then
'[You found a match. Your code here]
End If
Next i
Next page
这仍然很麻烦,因为您必须在新文档中打开每个页面。如果有一种很好的方法来确定您纯粹通过文本处理的页面(例如页面a底部的页码,紧接着是页面b顶部的页眉),那么您可能会看到复制将文档的整个文本分成一个字符串,然后使用文本中的线索确定找到匹配后要提取的页面。我相信这会快得多。
答案 2 :(得分:0)
Sub BatchRenameCS()
Dim objApp As Object
Dim objPDDoc As Object
Dim objjso As Object
Dim newPDF As Acrobat.CAcroPDDoc
Dim lastrow2 As Long
Dim strFileName As String
Dim Folder As String
Dim Page As Long
Dim Cell As Long
Dim PDFCharacterCount() As Long
Dim CharacterCount As Long
Dim i As Integer
Dim c As Integer
Dim x As Integer
Dim strSource As String
Dim strResult As String
Dim PDFCharacters As String
Dim PDFCharacters2 As String
Dim PDFPasteData() As String
Dim PasteDataPage As Integer
Dim LastRow As Long
Dim NewName As String
Dim NewNamePageNum As Integer
Dim Check()
Sheets("Sheet1").Range("C:D").ClearContents
strFileName = selectFile()
Folder = GetFolder()
'create array with pdf word count
Set objApp = CreateObject("AcroExch.App")
Set objPDDoc = CreateObject("AcroExch.PDDoc")
'AD.1 open file, if =false file is damage
If objPDDoc.Open(strFileName) Then
Set objjso = objPDDoc.GetJSObject
ReDim PDFCharacterCount(1 To objPDDoc.GetNumPages) As Long
For Page = 1 To objPDDoc.GetNumPages
PDFCharacters = ""
PDFCharacters2 = ""
For c = 0 To objjso.GetPageNumWords(Page - 1)
PDFCharacters = PDFCharacters & objjso.getPageNthWord(Page - 1, c)
Next c
For i = 1 To Len(PDFCharacters)
Select Case Asc(Mid(PDFCharacters, i, 1))
Case 48 To 57, 65 To 90, 97 To 122:
PDFCharacters2 = PDFCharacters2 & (Mid(PDFCharacters, i, 1))
Case Else
PDFCharacters2 = PDFCharacters2 & ""
End Select
Next
PDFCharacterCount(Page) = Len(PDFCharacters2)
Next Page
lastrow2 = Sheets("Sheet2").Cells(Rows.Count, "A").End(xlUp).Row
Page = 1
ReDim PDFPasteData(1 To objPDDoc.GetNumPages) As String
For Cell = 1 To lastrow2
strResult = ""
strSource = Sheets("Sheet2").Cells(Cell, 1).Text
PDFPasteData(Page) = PDFPasteData(Page) & " " & strSource
For i = 1 To Len(strSource)
Select Case Asc(Mid(strSource, i, 1))
Case 48 To 57, 65 To 90, 97 To 122:
strResult = strResult & (Mid(strSource, i, 1))
Case Else
strResult = strResult & ""
End Select
Next
CharacterCount = CharacterCount + Len(strResult)
If CharacterCount = PDFCharacterCount(Page) Then
CharacterCount = 0
Page = Page + 1
End If
Next Cell
ReDim Check(2, objPDDoc.GetNumPages)
LastRow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row
For Each LookUpCell In Worksheets("Sheet1").Range("A2:A" & LastRow)
For PasteDataPage = 1 To objPDDoc.GetNumPages
If InStr(PDFPasteData(PasteDataPage), LookUpCell.Value) Then
Check(1, PasteDataPage) = Check(1, PasteDataPage) + 1
Check(2, PasteDataPage) = Check(2, PasteDataPage) & LookUpCell.Offset(0, 1).Value & Chr(10)
If FileExist(Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf") Then
Set newPDF = CreateObject("AcroExch.pdDoc")
NewName = Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf"
newPDF.Open (NewName)
newPDF.InsertPages newPDF.GetNumPages - 1, objPDDoc, PasteDataPage - 1, 1, 0
newPDF.Save 1, NewName
newPDF.Close
Set newPDF = Nothing
Else
Set newPDF = CreateObject("AcroExch.PDDoc")
newPDF.Create
NewName = Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf"
newPDF.InsertPages -1, objPDDoc, PasteDataPage - 1, 1, 0
newPDF.Save 1, NewName
newPDF.Close
Set newPDF = Nothing
End If
End If
Next PasteDataPage
Next LookUpCell
x = 1
For PasteDataPage = 1 To objPDDoc.GetNumPages
If Check(1, PasteDataPage) <> 1 Then
Sheets("Sheet1").Cells(x, 3) = PasteDataPage
Sheets("Sheet1").Cells(x, 4) = Check(2, PasteDataPage)
x = x + 1
End If
Next PasteDataPage
End If
MsgBox "Done"
End Sub
Function FileExist(path As String) As Boolean
If Dir(path) <> vbNullString Then FileExist = True
End Function
Function selectFile()
Dim fd As FileDialog, fileName As String
On Error GoTo ErrorHandler
Set fd = Application.FileDialog(msoFileDialogFilePicker)
fd.AllowMultiSelect = False
If fd.Show = True Then
If fd.SelectedItems(1) <> vbNullString Then
fileName = fd.SelectedItems(1)
End If
Else
'Exit code if no file is selected
End
End If
'Return Selected FileName
selectFile = fileName
Set fd = Nothing
Exit Function
ErrorHandler:
Set fd = Nothing
MsgBox "Error " & Err & ": " & Error(Err)
End Function
Function GetFolder() As String
Dim fldr As FileDialog
Dim sItem As String
Set fldr = Application.FileDialog(msoFileDialogFolderPicker)
With fldr
.Title = "Select the Folder where you want you new PDFs to go"
.AllowMultiSelect = False
.InitialFileName = Application.DefaultFilePath
If .Show <> -1 Then GoTo NextCode
sItem = .SelectedItems(1)
End With
NextCode:
GetFolder = sItem
Set fldr = Nothing
End Function