如何以编程方式使用 Google 文档从 PDF 文件中提取文本?我已经知道还有其他方案,不过我很好奇是否可以将 Google 文档用于此类目的。
答案 0 :(得分:3)
如果想用 Python 将 PDF 文件转换为文本数据,可以使用 Drive API v3 来实现,但需要两个步骤:先将 PDF 上传并转换为 Google 文档,再将该文档导出为纯文本。
此示例基于 Python 快速入门(Quickstart),详细信息请参阅 https://developers.google.com/drive/v3/web/quickstart/python 。请先阅读其中的"第1步:开启云端硬盘API"和"第2步:安装Google客户端库"。如果您已经了解这些内容,请直接跳过。
使用以下示例脚本时,请修改如下。
请将以下导入添加到快速入门。
import io
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
请将授权范围(SCOPES)更改为以下内容。
# Full Drive scope: the script below both creates a file and exports it.
SCOPES = 'https://www.googleapis.com/auth/drive'
关于 main():请将快速入门中的 main() 函数替换为以下内容。
此示例脚本可以将 PDF 文件转换为 TXT 文件,但 PDF 文件中的图像无法保留到 TXT 文件中。
def main():
    """Convert a local PDF to a plain-text file by way of Google Docs.

    The PDF is uploaded to Drive with a Google Docs target MIME type in the
    file metadata, which asks Drive to convert it on import. The resulting
    document is then exported as ``text/plain`` and written to ``txtfile``.

    NOTE: the converted Google Doc is not deleted afterwards, so it remains
    in the authorised user's Drive.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    pdffile = 'sample.pdf'  # PDF file
    txtfile = 'sample.txt'  # Text file

    # The metadata names the *target* type (Google Docs); the media body
    # declares the *source* type of the bytes actually being uploaded.
    uploaded = service.files().create(
        body={
            'name': pdffile,
            'mimeType': 'application/vnd.google-apps.document',
        },
        media_body=MediaFileUpload(
            pdffile, mimetype='application/pdf', resumable=True),
    ).execute()

    export_request = service.files().export_media(
        fileId=uploaded['id'], mimeType="text/plain")

    # The context manager guarantees the output file is flushed and closed,
    # even if a chunk download raises part-way through.
    with io.FileIO(txtfile, 'wb') as out_file:
        downloader = MediaIoBaseDownload(out_file, export_request)
        done = False
        while not done:
            _status, done = downloader.next_chunk()
    print("Done.")


if __name__ == '__main__':
    main()
如果我误解了你的问题,我很抱歉。
以下是合并到快速入门之后的完整脚本:
from __future__ import print_function
import httplib2
import os
import io
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
# Command-line flags for the OAuth flow are parsed once, at import time.
# When argparse is unavailable, flags stays None and get_credentials()
# falls back to the legacy flow.
try:
    import argparse
except ImportError:
    flags = None
else:
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
# If modifying these scopes, delete your previously saved credentials
# at ./drive-python-quickstart.json (the path get_credentials() uses).
SCOPES = 'https://www.googleapis.com/auth/drive'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Drive API Python Quickstart'
def get_credentials():
    """Return usable OAuth2 credentials, running the consent flow if needed.

    Credentials are looked up in ``drive-python-quickstart.json`` relative to
    the current directory. If none are stored, or the stored ones are marked
    invalid, a flow built from CLIENT_SECRET_FILE and SCOPES is run and its
    result is returned; the flow is handed the same store.

    Returns:
        Credentials, either the stored ones or the newly obtained ones.
    """
    credential_path = os.path.join("./", 'drive-python-quickstart.json')
    store = Storage(credential_path)

    stored = store.get()
    if stored and not stored.invalid:
        # Stored credentials are still good; no user interaction required.
        return stored

    flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
    flow.user_agent = APPLICATION_NAME
    if flags:
        obtained = tools.run_flow(flow, store, flags)
    else:  # Needed only for compatibility with Python 2.6
        obtained = tools.run(flow, store)
    print('Storing credentials to ' + credential_path)
    return obtained
def main():
    """Convert a local PDF to a plain-text file by way of Google Docs.

    The PDF is uploaded to Drive with a Google Docs target MIME type in the
    file metadata, which asks Drive to convert it on import. The resulting
    document is then exported as ``text/plain`` and written to ``txtfile``.

    NOTE: the converted Google Doc is not deleted afterwards, so it remains
    in the authorised user's Drive.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    pdffile = '../Downloads/sample.pdf'  # PDF file
    txtfile = '../Downloads/sample.txt'  # Text file

    # The metadata names the *target* type (Google Docs); the media body
    # declares the *source* type of the bytes actually being uploaded.
    # Only the file name is used for the Drive title, not the local
    # relative path.
    uploaded = service.files().create(
        body={
            'name': os.path.basename(pdffile),
            'mimeType': 'application/vnd.google-apps.document',
        },
        media_body=MediaFileUpload(
            pdffile, mimetype='application/pdf', resumable=True),
    ).execute()

    export_request = service.files().export_media(
        fileId=uploaded['id'], mimeType="text/plain")

    # The context manager guarantees the output file is flushed and closed,
    # even if a chunk download raises part-way through.
    with io.FileIO(txtfile, 'wb') as out_file:
        downloader = MediaIoBaseDownload(out_file, export_request)
        done = False
        while not done:
            _status, done = downloader.next_chunk()
    print("Done.")


if __name__ == '__main__':
    main()