如何在 Python 中处理已损坏或受密码保护的 .doc/.docx 文件

时间:2019-05-01 19:48:30

标签: python-3.7 win32com

我有一个包含 20,000 个 .doc/.docx 文件的文件夹,需要把它们全部转换为 .pdf。我决定用 Python 来完成这项工作,并且已经拼凑出一段能够产生输出的简易代码。但是整个过程非常折腾,因为偶尔会遇到已损坏的 .doc/.docx 文件或受密码保护的文件。遇到这种情况时,我只想跳过这些文件并继续处理其余文件;而现在我只能先找出有问题的文件,把它从文件夹中删除,然后再重新运行。我已经查阅了 win32com 的文档,但没有找到相关内容。代码如下:

    from os import chdir, getcwd, listdir, path
from time import strftime
from win32com import client

def count_files(filetype, directory=None):
    ''' (str[, str]) -> int
    Returns the number of entries in a folder whose names end with the
    given file type. The match is a case-sensitive suffix test, so ".doc"
    does not count ".docx" files.

    filetype  -- suffix to look for, including the dot (e.g. ".docx").
    directory -- folder to scan; when omitted, the module-level ``folder``
                 is used, which is what existing callers rely on.
    >>> count_files(".docx")
    11
    '''
    if directory is None:
        directory = folder
    # The counter no longer reuses the function's own name as a local.
    return sum(1 for name in listdir(directory) if name.endswith(filetype))

# Function "check_path" is used to check whether the path the user provided does
# actually exist. The user is prompted for a path until the existence of the
# provided path has been verified.

def check_path(prompt):
    ''' (str) -> str
    Asks the user for a path, showing ``prompt``, and keeps asking until the
    entered path exists. Returns the first existing path that was entered.
    '''
    # input() is the Python 3 spelling of raw_input(); raw_input() raises
    # NameError on the Python 3 interpreter this script targets.
    abs_path = input(prompt)
    while not path.exists(abs_path):
        print("\nThe specified path does not exist.")
        abs_path = input(prompt)
    return abs_path

print("\n")

# Folder that holds the Word documents; replace the placeholder with a real
# absolute path before running the script.
folder = "My Absolute Folder Path Here"

# Make that folder the working directory.
chdir(folder)

# Count the docx and doc files in the folder.
num_docx = count_files(".docx")
num_doc = count_files(".doc")

# Nothing to convert: report it and stop the script.
if num_docx + num_doc == 0:
    print("\nThe specified folder does not contain docx or docs files.")
    print(strftime("%H:%M:%S"), "There are no files to convert. BYE, BYE!.")
    # exit() is an interactive helper injected by the site module and is not
    # guaranteed to exist; raising SystemExit ends the script the same way.
    raise SystemExit

print("\nNumber of doc and docx files: ", num_docx + num_doc, "")
print(strftime("%H:%M:%S"), "Starting to convert files ...")

# Convert every .docx/.doc file in the working directory to PDF with a single
# Word instance. Each file is converted inside its own try/except so that a
# corrupt or password-protected document is reported and skipped instead of
# aborting the rest of the batch.

PDF_FORMAT = 17  # numeric value of Word's wdFormatPDF constant
# Supplying a (wrong) password makes Word raise an error for protected
# documents instead of blocking on a password dialog; documents without a
# password still open normally.
BOGUS_PASSWORD = "not-the-password"

skipped = []  # names of the files that could not be converted
word = None   # stays None if Word cannot be started (checked in finally)
try:
    word = client.DispatchEx("Word.Application")
    word.Visible = False
    word.DisplayAlerts = 0  # wdAlertsNone: no modal dialogs during the batch
    for files in listdir(getcwd()):
        if files.endswith(".docx"):
            ext, label = ".docx", " docx -> pdf "
        elif files.endswith(".doc"):
            ext, label = ".doc", " doc  -> pdf "
        else:
            continue
        # Swap only the trailing extension; str.replace would also rewrite a
        # ".doc" that happens to appear earlier in the file name.
        new_name = files[:-len(ext)] + ".pdf"
        in_file = path.abspath(path.join(folder, files))
        new_file = path.abspath(path.join(folder, new_name))
        doc = None
        try:
            doc = word.Documents.Open(
                in_file,
                ConfirmConversions=False,
                ReadOnly=True,
                AddToRecentFiles=False,
                PasswordDocument=BOGUS_PASSWORD,
            )
            print(strftime("%H:%M:%S"), label, path.relpath(new_file))
            doc.SaveAs(new_file, FileFormat=PDF_FORMAT)
        except Exception as e:
            # Broad on purpose: any failure to open or save this one file
            # must not stop the remaining conversions.
            skipped.append(files)
            print(strftime("%H:%M:%S"), " skipped     ", files, "-", e)
        finally:
            if doc is not None:
                try:
                    doc.Close(SaveChanges=False)
                except Exception as e:
                    print(strftime("%H:%M:%S"), " close failed ", files, "-", e)
except Exception as e:
    # Reached when Word itself cannot be started or the folder cannot be read.
    print(e)
finally:
    # Only quit Word if it was actually started.
    if word is not None:
        word.Quit()

if skipped:
    print("\nNumber of skipped files: ", len(skipped))

print("\n", strftime("%H:%M:%S"), "Finished converting files.")

# Tally the pdf files now present in the folder and report the total.
num_pdf = count_files(".pdf")
print("\nNumber of pdf files: ", num_pdf)

# Compare the number of source documents with the number of pdf files.
if num_pdf != num_docx + num_doc:
    print("\nNumber of doc and docx files is not equal to number of pdf files.")
else:
    print("\nNumber of doc and docx files is equal to number of pdf files.")

1 个答案:

答案 0 :(得分:0)

以下是我用来创建 PDF 的代码(Excel VBA)。这是我能提供的最好的帮助了,希望对您有用。

Sub WordtoPDF()
'OPEN IN EXCEL
'Takes the files of one file type from a folder and uses Word to save each of
'them as a PDF with the same base name in a (possibly different) folder.
'A file that Word cannot open or export (for example a corrupt or password
'protected document) is skipped and logged to the Immediate window; the run
'continues with the next file.
'NOTE: the export format is always PDF, whatever is entered as new file type;
'the new file type only determines the extension of the output file name.

    Const EXPORT_FORMAT_PDF As Long = 17    '17 = wdExportFormatPDF
    Const ALERTS_NONE As Long = 0           '0 = wdAlertsNone
    'A deliberately wrong password: Word then raises an error for protected
    'documents instead of waiting on a password prompt, while documents
    'without a password still open.
    Const BOGUS_PASSWORD As String = "not-the-password"

    Dim strOldPath As String
    Dim strNewPath As String
    Dim OldType As String
    Dim NewType As String
    Dim Path As String
    Dim fileName As String
    Dim baseName As String
    Dim TempString As String
    Dim TempString2 As String
    Dim i As Long
    Dim skippedCount As Long
    Dim appWD As Object
    Dim objDoc As Object

    Dim coll As Collection
    Set coll = New Collection

    'Allows to be used on any folder
    Do While strOldPath = "" Or strOldPath = "False"
        strOldPath = Application.InputBox("FolderPath containing Original files", "FolderPath eg  C:\temp", "C:\temp", Type:=2)
        If Dir(strOldPath, vbDirectory) = "" Then
            MsgBox ("Directory doesn't exist")
            Exit Sub
        End If
    Loop

    'The file type is entered without a dot, so input containing "." is rejected
    Do While OldType = "" Or OldType = "False" Or InStr(OldType, ".") <> 0
        OldType = Application.InputBox("Original file's filetype", "FolderPath eg  docx", "docx", Type:=2)
    Loop

    'Fixed: the loop condition now tests strNewPath (it tested strOldPath before)
    Do While strNewPath = "" Or strNewPath = "False"
        strNewPath = Application.InputBox("location of NEW files.", "FolderPath eg  C:\temp", strOldPath, Type:=2)
        If Dir(strNewPath, vbDirectory) = "" Then
            MsgBox ("Directory doesn't exist")
            Exit Sub
        End If
    Loop

    Do While NewType = "" Or NewType = "False" Or InStr(NewType, ".") <> 0
        NewType = Application.InputBox("file type to convert files to", "FolderPath eg  docx becomes pdf", "pdf", Type:=2)
    Loop

'AAAAA
'Builds a collection with the names of the files in the old folder that end
'in "." & OldType. The wildcard search can also return longer extensions
'(for example *.doc finding .docx files), so every name is checked before it
'is added; nothing has to be removed from the collection afterwards.
    Path = strOldPath & "\*." & OldType
    fileName = Dir(Path)
    Do While fileName <> ""
        If UCase(Right(fileName, Len(OldType) + 1)) = UCase("." & OldType) Then
            coll.Add fileName
        End If
        fileName = Dir()
    Loop

    If coll.Count = 0 Then
        MsgBox ("No " & OldType & " files found in " & strOldPath)
        Exit Sub
    End If
'AAAAA

'BBBBB
'Checks new location to make sure that files won't be saving over existing files
    For i = 1 To coll.Count
        baseName = Left(coll(i), Len(coll(i)) - Len(OldType) - 1)
        If Dir(strNewPath & "\" & baseName & "." & NewType) <> "" Then
            MsgBox (baseName & "." & NewType & " already exists in " & strNewPath)
            Exit Sub
        End If
    Next i
'BBBBB

'CCCCC
'Opens each Old Type file in the original location using word, and saves as PDF with the same name in the new location
    Set appWD = CreateObject("Word.Application")
    appWD.DisplayAlerts = ALERTS_NONE

    For i = 1 To coll.Count
        baseName = Left(coll(i), Len(coll(i)) - Len(OldType) - 1)
        TempString = strOldPath & "\" & coll(i)
        TempString2 = strNewPath & "\" & baseName & "." & NewType

        Set objDoc = Nothing
        'Errors are handled inline for this one file so that a bad document
        'does not stop the whole run
        On Error Resume Next
        Set objDoc = appWD.Documents.Open(FileName:=TempString, _
                                          ConfirmConversions:=False, _
                                          ReadOnly:=True, _
                                          AddToRecentFiles:=False, _
                                          PasswordDocument:=BOGUS_PASSWORD)
        If Err.Number = 0 Then
            objDoc.ExportAsFixedFormat OutputFileName:=TempString2, ExportFormat:=EXPORT_FORMAT_PDF
        End If
        If Err.Number <> 0 Then
            skippedCount = skippedCount + 1
            Debug.Print "Skipped " & TempString & ": " & Err.Description
            Err.Clear
        End If
        'Always close the document so that Word does not keep it open
        If Not objDoc Is Nothing Then
            objDoc.Close SaveChanges:=False
            Set objDoc = Nothing
        End If
        Err.Clear
        On Error GoTo 0
    Next i
'CCCCC

    If Not appWD Is Nothing Then
        appWD.Quit
        Set appWD = Nothing
    End If

    If skippedCount > 0 Then
        MsgBox (skippedCount & " file(s) could not be converted; see the Immediate window for details")
    End If

End Sub