使用PDFMiner时出现OSError“不是有效的Win32应用程序”

时间:2019-05-20 20:08:28

标签: python pdfminer

我正在尝试导入一堆PDF并建立一个语料库。我尝试使用pdfminer,但出现OSError。

我的代码:

import os
# Root folder of the project (absolute, machine-specific Windows path).
BASE = r"C:\Users\dangeph\Desktop\DataScience\PDFMiner"
# Default folder that is scanned for input PDF files.
DOCS = os.path.join(BASE, "data", "docs")
def get_documents(path=DOCS):
    """Yield the joined path of each entry in *path* whose name ends in '.pdf'.

    Only the immediate entries returned by ``os.listdir`` are considered
    (no recursion), and the suffix comparison is case-sensitive.
    """
    yield from (
        os.path.join(path, entry)
        for entry in os.listdir(path)
        if entry.endswith('.pdf')
    )
# Show how many PDF paths get_documents() yields for the default directory.
print(sum(1 for _ in get_documents()))

import codecs
import re
import shutil
import string
import subprocess
import sys
import unicodedata

import nltk
# Default output folder for the extracted text files.
CORPUS = os.path.join(BASE, "data", "corpus")
def extract_corpus(docs=DOCS, corpus=CORPUS):
    """Convert every PDF found in *docs* into a plain-text file in *corpus*.

    Each PDF is passed to PDFMiner's ``pdf2txt.py`` command-line script. The
    captured output is decoded as UTF-8, NFKD-normalized, reduced to the
    characters in ``string.printable`` and written to
    ``<corpus>/<pdf name without extension>.txt``.

    Args:
        docs: Directory handed to ``get_documents`` to locate the PDFs.
        corpus: Output directory; created (including parents) if missing.

    Raises:
        subprocess.CalledProcessError: If ``pdf2txt.py`` exits with a
            non-zero status.
        UnicodeDecodeError: If the script's output is not valid UTF-8.
    """
    # makedirs also creates a missing parent folder and, with exist_ok,
    # avoids the race between an existence check and the creation.
    os.makedirs(corpus, exist_ok=True)

    # Windows cannot launch a ``.py`` file as a process (WinError 193), so
    # run the script through the current interpreter instead. Resolve the
    # script on PATH when possible; otherwise fall back to the bare name,
    # which the interpreter resolves relative to the working directory.
    script = shutil.which('pdf2txt.py') or 'pdf2txt.py'
    command = [sys.executable, script]

    # Build the lookup set once instead of scanning the string per character.
    printable = frozenset(string.printable)

    for path in get_documents(docs):
        raw_output = subprocess.check_output(command + [path])
        normalized = unicodedata.normalize('NFKD', raw_output.decode('utf-8'))
        # On Python 3 ``filter`` returns an iterator, which cannot be passed
        # to ``write``; join the kept characters back into a real string.
        document = ''.join(char for char in normalized if char in printable)

        fname = os.path.splitext(os.path.basename(path))[0] + ".txt"
        outpath = os.path.join(corpus, fname)
        # Only printable ASCII remains, so UTF-8 is a safe explicit encoding.
        with open(outpath, 'w', encoding='utf-8') as f:
            f.write(document)
# Run the extraction with the default DOCS and CORPUS directories.
extract_corpus()

错误:

Traceback (most recent call last)

OSError: [WinError 193] %1 is not a valid Win32 application

回溯:

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-7-32049ddef2ab> in <module>
     31             f.write(document)
     32 # Run the extraction
---> 33 extract_corpus()

<ipython-input-7-32049ddef2ab> in extract_corpus(docs, corpus)
     18         # Call the subprocess command (must be on your path)
     19         document = subprocess.check_output(
---> 20             ['pdf2txt.py', path]
     21         )
     22         # Encode UTF-u and remove non-printable characters

C:\ProgramData\Anaconda3\lib\subprocess.py in check_output(timeout, *popenargs, **kwargs)
    393 
    394     return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
--> 395                **kwargs).stdout
    396 
    397 

C:\ProgramData\Anaconda3\lib\subprocess.py in run(input, capture_output, timeout, check, *popenargs, **kwargs)
    470         kwargs['stderr'] = PIPE
    471 
--> 472     with Popen(*popenargs, **kwargs) as process:
    473         try:
    474             stdout, stderr = process.communicate(input, timeout=timeout)

C:\ProgramData\Anaconda3\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
    773                                 c2pread, c2pwrite,
    774                                 errread, errwrite,
--> 775                                 restore_signals, start_new_session)
    776         except:
    777             # Cleanup if the child failed starting.

C:\ProgramData\Anaconda3\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
   1176                                          env,
   1177                                          os.fspath(cwd) if cwd is not None else None,
-> 1178                                          startupinfo)
   1179             finally:
   1180                 # Child is launched. Close the parent's copy of those pipe

1 个答案:

答案 0 :(得分:3)

您不能通过subprocess.check_output直接调用pdf2txt.py,因为Windows无法原生执行.py文件(它不是有效的Win32可执行程序,所以才会出现WinError 193)。请改用subprocess.check_output(['python', 'pdf2txt.py', path]),把脚本交给Python解释器来运行。您也可以使用sys.executable获取当前解释器的可执行文件路径,而不必把python硬编码进命令里。