我正在尝试导入一堆PDF并建立一个语料库。我尝试使用pdfminer,但出现OSError。
我的代码:
import os
BASE = os.path.join(r"C:\Users\dangeph\Desktop\DataScience\PDFMiner")
DOCS = os.path.join(BASE, "data", "docs")
def get_documents(path=DOCS):
for name in os.listdir(path):
if name.endswith('.pdf'):
yield os.path.join(path, name)
print(len(list(get_documents())))
import re
import nltk
import codecs
import string
import subprocess
import unicodedata
CORPUS = os.path.join(BASE, "data", "corpus")
def extract_corpus(docs=DOCS, corpus=CORPUS):
if not os.path.exists(corpus):
os.mkdir(corpus)
for path in get_documents(docs):
document = subprocess.check_output(
['pdf2txt.py', path]
)
document = filter(
lambda char: char in string.printable,
unicodedata.normalize('NFKD', document.decode('utf-8'))
)
fname = os.path.splitext(os.path.basename(path))[0] + ".txt"
outpath = os.path.join(corpus, fname)
with codecs.open(outpath, 'w') as f:
f.write(document)
extract_corpus()
错误:
Traceback (most recent call last)
OSError: [WinError 193] %1 is not a valid Win32 application
回溯:
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-7-32049ddef2ab> in <module>
31 f.write(document)
32 # Run the extraction
---> 33 extract_corpus()
<ipython-input-7-32049ddef2ab> in extract_corpus(docs, corpus)
18 # Call the subprocess command (must be on your path)
19 document = subprocess.check_output(
---> 20 ['pdf2txt.py', path]
21 )
22 # Encode UTF-u and remove non-printable characters
C:\ProgramData\Anaconda3\lib\subprocess.py in check_output(timeout, *popenargs, **kwargs)
393
394 return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
--> 395 **kwargs).stdout
396
397
C:\ProgramData\Anaconda3\lib\subprocess.py in run(input, capture_output, timeout, check, *popenargs, **kwargs)
470 kwargs['stderr'] = PIPE
471
--> 472 with Popen(*popenargs, **kwargs) as process:
473 try:
474 stdout, stderr = process.communicate(input, timeout=timeout)
C:\ProgramData\Anaconda3\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
773 c2pread, c2pwrite,
774 errread, errwrite,
--> 775 restore_signals, start_new_session)
776 except:
777 # Cleanup if the child failed starting.
C:\ProgramData\Anaconda3\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
1176 env,
1177 os.fspath(cwd) if cwd is not None else None,
-> 1178 startupinfo)
1179 finally:
1180 # Child is launched. Close the parent's copy of those pipe
答案 0 :(得分:3)
您无法直接在subprocess.check_output
上调用pdf2txt.py
,因为Windows不知道如何本地执行.py
。请改用subprocess.check_output(['python', 'pdf2txt.py', path])
。您可以使用sys.executable
来获取当前解释器的可执行文件,而不必使用python
作为硬编码值。