我有一个包含20,000个.doc / docx文件的文件夹。我需要将所有这些都转换为.pdf。我决定使用python来实现这一目标,并且能够将快速而肮脏的代码组合在一起以实现输出。但是,我不得不忍受这个过程,因为我偶尔会遇到损坏的.doc或.docx文件或受密码保护的文件。在这些情况下,我只想跳过这些文件并继续。我最终不得不找到有问题的文件,然后从文件夹中删除并继续。我已经在win32com文档中进行了挖掘,但是找不到任何东西。下面的代码
from os import chdir, getcwd, listdir, path
from time import strftime
from win32com import client
def count_files(filetype):
''' (str) -> int
Returns the number of files given a specified file type.
>>> count_files(".docx")
11
'''
count_files = 0
for files in listdir(folder):
if files.endswith(filetype):
count_files += 1
return count_files
# Function "check_path" is used to check whether the path the user provided does
# actually exist. The user is prompted for a path until the existence of the
# provided path has been verified.
def check_path(prompt):
''' (str) -> str
Verifies if the provided absolute path does exist.
'''
abs_path = raw_input(prompt)
while path.exists(abs_path) != True:
print ("\nThe specified path does not exist.")
abs_path = raw_input(prompt)
return abs_path
print ("\n")
folder = "My Absolute Folder Path Here"
# Change the directory.
chdir(folder)
# Count the number of docx and doc files in the specified folder.
num_docx = count_files(".docx")
num_doc = count_files(".doc")
# Check if the number of docx or doc files is equal to 0 (= there are no files
# to convert) and if so stop executing the script.
if num_docx + num_doc == 0:
print ("\nThe specified folder does not contain docx or docs files.")
print (strftime("%H:%M:%S"), "There are no files to convert. BYE, BYE!.")
exit()
else:
print ("\nNumber of doc and docx files: ", num_docx + num_doc, "")
print (strftime("%H:%M:%S"), "Starting to convert files ...")
# Try to open win32com instance. If unsuccessful return an error message.
try:
word = client.DispatchEx("Word.Application")
for files in listdir(getcwd()):
if files.endswith(".docx"):
new_name = files.replace(".docx", r".pdf")
in_file = path.abspath(folder + "\\" + files)
new_file = path.abspath(folder + "\\" + new_name)
doc = word.Documents.Open(in_file)
print (strftime("%H:%M:%S"), " docx -> pdf ", path.relpath(new_file))
doc.SaveAs(new_file, FileFormat = 17)
doc.Close()
if files.endswith(".doc"):
new_name = files.replace(".doc", r".pdf")
in_file = path.abspath(folder + "\\" + files)
new_file = path.abspath(folder + "\\" + new_name)
doc = word.Documents.Open(in_file)
print (strftime("%H:%M:%S"), " doc -> pdf ", path.relpath(new_file))
doc.SaveAs(new_file, FileFormat = 17)
doc.Close()
except Exception as e:
print (e)
finally:
word.Quit()
print ("\n", strftime("%H:%M:%S"), "Finished converting files.")
# Count the number of pdf files.
num_pdf = count_files(".pdf")
print ("\nNumber of pdf files: ", num_pdf)
# Check if the number of docx and doc file is equal to the number of files.
if num_docx + num_doc == num_pdf:
print ("\nNumber of doc and docx files is equal to number of pdf files.")
else:
print ("\nNumber of doc and docx files is not equal to number of pdf files.")
答案 0 :(得分:0)
以下代码是我使用的(Excel VBA)创建PDF的代码。这是我能为您提供的最好的帮助。希望对您有所帮助。
Sub WordtoPDF()
'OPEN IN EXCEL
'takes files from a location of particular file type, uses WORD to save them as PDFs to new location
Dim strOldFileName As String
Dim strOldPath As String
Dim strNewFileName As String
Dim strNewPath As String
Dim OldType As String
Dim NewType As String
Dim AraryFileNames() As String
Dim Path as String
Dim coll As Collection
Set coll = New Collection
'Allows to be used on any folder
Do While strOldPath = "" Or strOldPath = "False"
strOldPath = Application.InputBox("FolderPath containing Original files", "FolderPath eg C:\temp", "C:\temp", Type:=2)
If Dir(strOldPath, vbDirectory) = "" Then
MsgBox ("Directory doesn't exist")
Exit Sub
Else
End If
Loop
Do While OldType = "" Or OldType = "False" Or InStr(OldType, ".") <> 0
OldType = Application.InputBox("Original file's filetype", "FolderPath eg docx", "docx", Type:=2)
Loop
Do While strNewPath = "" Or strOldPath = "False"
strNewPath = Application.InputBox("location of NEW files.", "FolderPath eg C:\temp", strOldPath, Type:=2)
If Dir(strNewPath, vbDirectory) = "" Then
MsgBox ("Directory doesn't exist")
Exit Sub
Else
End If
Loop
Do While NewType = "" Or NewType = "False" Or InStr(NewType, ".") <> 0
NewType = Application.InputBox("file type to convert files to", "FolderPath eg docx becomes pdf", "pdf", Type:=2)
Loop
'AAAAA
'Counts how many files there are in the folder with the ".docx"(OldType) ending and makes a collection of their names
'creates a collection of with only "OldType" filetype.
Path = strOldPath & "\*." & OldType
fileName = Dir(Path)
coll.Add fileName
Do While fileName <> ""
Count = Count + 1
fileName = Dir()
coll.Add fileName
Loop
Dim item As Variant
On Error GoTo line1:
For Each item In coll
For i = 1 To coll.Count
If UCase(Right(coll(i), Len(OldType))) <> UCase(OldType) Then
coll.Remove (i)
i = 0
Else
End If
Next i
Next item
line1:
On Error GoTo 0
'AAAAA
'BBBBB
'Checks new location to make sure that files won't be saving over existing files
'collection file names with NewType extension checks to see if unique in new location.
On Error GoTo Error:
For i = 1 To coll.Count
Path2 = strNewPath & "\*." & NewType
fileName2 = Dir(Path2)
Do While fileName2 <> ""
If UCase(Left(coll(i), Len(coll(i)) - Len(OldType)) & NewType) = UCase(fileName2) Then
MsgBox (Left(coll(i), Len(coll(i)) - Len(OldType)) & NewType & " already exists in " & strNewPath)
Exit Sub
Else
End If
fileName2 = Dir()
Loop
Next i
Error:
On Error GoTo 0
'BBBBB
'CCCCC
'Opens each Old Type file in the original location using word, and saves as PDF with the same name in the new location
Set appWD = CreateObject("Word.Application")
For i = 1 To coll.Count
TempString = strOldPath & "\" & Left(coll(i), Len(coll(i)) - Len(OldType) - 1) & "." & OldType
Set objDoc = appWD.Documents.Open(fileName:=TempString)
TempString2 = strNewPath & "\" & Left(coll(i), Len(coll(i)) - Len(OldType) - 1) & "." & NewType
objDoc.ExportAsFixedFormat OutputFileName:=TempString2, ExportFormat:=17 '17 = wdExportFormatPDF
Next i
'CCCCC
If Not appWD Is Nothing Then
appWD.Quit
Set appWD = Nothing
End If
End Sub